| // Copyright 2022 The Centipede Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef THIRD_PARTY_CENTIPEDE_UTIL_H_ |
| #define THIRD_PARTY_CENTIPEDE_UTIL_H_ |
| |
| #include <algorithm> |
| #include <cstddef> |
| #include <cstdint> |
| #include <string> |
| #include <string_view> |
| #include <vector> |
| |
| #include "absl/types/span.h" |
| #include "./centipede/defs.h" |
| #include "./centipede/feature.h" |
| #include "./centipede/logging.h" |
| |
| namespace centipede { |
| |
| // Returns a printable hash of a byte array. Currently sha1 is used. |
| std::string Hash(absl::Span<const uint8_t> span); |
| // Same as above, but for std::string_view. |
| std::string Hash(std::string_view str); |
| // Hashes are always this many bytes. |
| inline constexpr size_t kHashLen = 40; |
| // Returns the hash of the contents of the file `file_path`. |
| std::string HashOfFileContents(std::string_view file_path); |
| // Returns a printable string representing at most `max_len` bytes of `data`. |
| std::string AsString(const ByteArray &data, size_t max_len = 16); |
| // Reads from a local file `file_path` into `data`. |
| // Crashes on any error. |
| void ReadFromLocalFile(std::string_view file_path, ByteArray &data); |
| // Same as above. |
| void ReadFromLocalFile(std::string_view file_path, std::string &data); |
| // Same as above but for FeatureVec. |
| // Crashes if the number of read bytes is not 0 mod sizeof(feature_t). |
| void ReadFromLocalFile(std::string_view file_path, FeatureVec &data); |
| // Same as above but for vector<uint32_t>. |
| void ReadFromLocalFile(std::string_view file_path, std::vector<uint32_t> &data); |
| // Writes the contents of `data` to a local file `file_path`. |
| // Crashes on any error. |
| void WriteToLocalFile(std::string_view file_path, |
| absl::Span<const uint8_t> data); |
| // Same as above. |
| void WriteToLocalFile(std::string_view file_path, std::string_view data); |
| // Same as above but for FeatureVec. |
| void WriteToLocalFile(std::string_view file_path, const FeatureVec &data); |
| // Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`. |
| void WriteToLocalHashedFileInDir(std::string_view dir_path, |
| absl::Span<const uint8_t> data); |
| // Returns a path string suitable to create a temporary local directory. |
| // Will return the same value every time it is called within one thread, |
| // but different values for different threads and difference processes. |
| std::string TemporaryLocalDirPath(); |
| |
| // Creates an empty dir `path` and schedules it for deletion at exit. |
| // Uses atexit(), and so the removal will not happen if exit() is bypassed. |
| // This function is supposed to be called only on paths created by |
| // TemporaryLocalDirPath(). |
| void CreateLocalDirRemovedAtExit(std::string_view path); |
| |
| // Requests that the process exits soon, with `exit_code`. |
| // `exit_code` must be non-zero (!= EXIT_SUCCESS). |
| // Async-signal-safe. |
| void RequestEarlyExit(int exit_code); |
| // Returns true iff RequestEarlyExit() was called. |
| bool EarlyExitRequested(); |
| // Returns the value most recently passed to RequestEarlyExit() |
| // or 0 if RequestEarlyExit() was not called. |
| int ExitCode(); |
| |
| // If `seed` != 0, returns `seed`, otherwise returns a random number |
| // based on time, pid, tid, etc. |
| size_t GetRandomSeed(size_t seed); |
| |
| // Returns a string that starts with `prefix` and that uniquely identifies |
| // the caller's process and thread. |
| std::string ProcessAndThreadUniqueID(std::string_view prefix); |
| |
| // Computes a random subset of `set` that needs to be |
| // removed to reach `target_size` non-zero weights in the set. |
| // `set` is an array of weights, some of which could be zero. |
| // |
| // Subsets with smaller combined weight are more likely to be returned. |
| // The return value is a sorted vector of indices of elements of `set` |
| // that need to be removed, including those with zero weights. |
| // Randomness is retrieved from `rng`. |
| // |
| // Example: set = {20, 10, 0, 40, 50} |
| // For target_size >= 4, the return value will be {2}, i.e. indices of all 0s. |
| // For target_size == 3, the return value will be one of |
| // {1, 2}, {0, 2}, {2, 3}, {2, 4}, i.e. the indices of 0s and one more index. |
| // {1, 2} is more likely than {2, 4} because set[1] < set[4]. |
| // |
| // This is a flavour of https://en.wikipedia.org/wiki/Reservoir_sampling |
| // with two differences: |
| // * Elements with weight 0 are unconditionally included. |
| // * We choose which elements to remove instead of which elements to pick. |
| // We use this inverted algorithm because in a typical use case |
| // `target_size` is just slightly smaller than set.size(). |
| std::vector<size_t> RandomWeightedSubset(absl::Span<const uint64_t> set, |
| size_t target_size, Rng &rng); |
| |
| // Removes all elements from `set` whose indices are found in `subset_indices`. |
| // `subset_indices` is a sorted vector. |
| template <typename T> |
| void RemoveSubset(const std::vector<size_t> &subset_indices, |
| std::vector<T> &set) { |
| size_t pos_to_write = 0; |
| for (size_t i = 0, n = set.size(); i < n; i++) { |
| // If subset_indices.size() is k, this loop's complexity is O(n*log(k)). |
| // We can do it in O(n+k) with a bit more code, but this loop is not |
| // expected to be hot. Besides, k would typically be small. |
| if (!std::binary_search(subset_indices.begin(), subset_indices.end(), i)) |
| std::swap(set[pos_to_write++], set[i]); |
| } |
| set.resize(pos_to_write); |
| } |
| |
| // Adds a prefix and a postfix to `data` such that the result can be |
| // appended to another such packed data and then the operation can be reversed. |
| // The purpose is to allow appending blobs of data to a (possibly remote) file |
| // such that when reading this file we can separate the blobs. |
| // TODO(kcc): [impl] is there a lightweight equivalent in the open-source world? |
| // tar sounds too heavy. |
| // TODO(kcc): [impl] investigate https://github.com/google/riegeli. |
| ByteArray PackBytesForAppendFile(const ByteArray &data); |
| // Unpacks `packed_data` into `unpacked` and `hashes`. |
| // `packed_data` is multiple data packed by PackBytesForAppendFile() |
| // and merged together. |
| // `unpacked` or `hashes` can be nullptr. |
| void UnpackBytesFromAppendFile(const ByteArray &packed_data, |
| std::vector<ByteArray> *unpacked, |
| std::vector<std::string> *hashes = nullptr); |
| // Append the bytes from 'hash' to 'ba'. |
| void AppendHashToArray(ByteArray &ba, std::string_view hash); |
| // Reverse to AppendHashToArray. |
| std::string ExtractHashFromArray(ByteArray &ba); |
| |
| // Pack {features, Hash(data)} into a byte array. |
| ByteArray PackFeaturesAndHash(const ByteArray &data, |
| const FeatureVec &features); |
| |
| // Parses `dictionary_text` representing an AFL/libFuzzer dictionary. |
| // https://github.com/google/AFL/blob/master/dictionaries/README.dictionaries |
| // https://llvm.org/docs/LibFuzzer.html#dictionaries |
| // Fills in `dictionary_entries` with byte sequences from the dictionary. |
| // Returns true iff parsing completes successfully. |
| bool ParseAFLDictionary(std::string_view dictionary_text, |
| std::vector<ByteArray> &dictionary_entries); |
| |
| // Maps `size` bytes with `mmap(NO_RESERVE)` or equivalent, returns the result. |
| // CHECK-fails on error. |
| // The resulting memory is unreserved, and will be zero-initialized on first |
| // access. |
| uint8_t *MmapNoReserve(size_t size); |
| |
| // Unmaps memory returned by `MmapNoReserve`(). |
| // CHECK-fails on error. |
| void Munmap(uint8_t *ptr, size_t size); |
| |
| // Fixed size array allocated/deallocated with MmapNoReserve/Munmap. |
| // operator[] checks indices for out-of-bounds. |
| template <size_t kSize> |
| class MmapNoReserveArray { |
| public: |
| MmapNoReserveArray() : array_(MmapNoReserve(kSize)) {} |
| ~MmapNoReserveArray() { Munmap(array_, kSize); } |
| MmapNoReserveArray(const MmapNoReserveArray &) = delete; |
| MmapNoReserveArray &operator=(const MmapNoReserveArray &) = delete; |
| MmapNoReserveArray(MmapNoReserveArray &&) = delete; |
| MmapNoReserveArray &operator=(MmapNoReserveArray &&) = delete; |
| uint8_t operator[](size_t i) const { |
| CHECK_LT(i, kSize); |
| return array_[i]; |
| } |
| uint8_t &operator[](size_t i) { |
| CHECK_LT(i, kSize); |
| return array_[i]; |
| } |
| |
| private: |
| uint8_t *array_; |
| }; |
| |
| } // namespace centipede |
| |
| #endif // THIRD_PARTY_CENTIPEDE_UTIL_H_ |