centipede/util.h - third_party/github/google/fuzztest - Git at Google

 // Copyright 2022 The Centipede Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef THIRD_PARTY_CENTIPEDE_UTIL_H_
 #define THIRD_PARTY_CENTIPEDE_UTIL_H_

 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <string>
 #include <string_view>
 #include <vector>

 #include "absl/types/span.h"
 #include "./centipede/defs.h"
 #include "./centipede/feature.h"
 #include "./centipede/logging.h"

 namespace centipede {

 // Returns a printable hash of a byte array. Currently sha1 is used.
 std::string Hash(absl::Span<const uint8_t> span);
 // Same as above, but for std::string_view.
 std::string Hash(std::string_view str);
 // Hashes are always this many bytes.
 inline constexpr size_t kHashLen = 40;
 // Returns the hash of the contents of the file `file_path`.
 std::string HashOfFileContents(std::string_view file_path);
 // Returns a printable string representing at most `max_len` bytes of `data`.
 std::string AsString(const ByteArray &data, size_t max_len = 16);
 // Reads from a local file `file_path` into `data`.
 // Crashes on any error.
 void ReadFromLocalFile(std::string_view file_path, ByteArray &data);
 // Same as above.
 void ReadFromLocalFile(std::string_view file_path, std::string &data);
 // Same as above but for FeatureVec.
 // Crashes if the number of read bytes is not 0 mod sizeof(feature_t).
 void ReadFromLocalFile(std::string_view file_path, FeatureVec &data);
 // Same as above but for vector<uint32_t>.
 void ReadFromLocalFile(std::string_view file_path, std::vector<uint32_t> &data);
 // Writes the contents of `data` to a local file `file_path`.
 // Crashes on any error.
 void WriteToLocalFile(std::string_view file_path,
                       absl::Span<const uint8_t> data);
 // Same as above.
 void WriteToLocalFile(std::string_view file_path, std::string_view data);
 // Same as above but for FeatureVec.
 void WriteToLocalFile(std::string_view file_path, const FeatureVec &data);
 // Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`.
 void WriteToLocalHashedFileInDir(std::string_view dir_path,
                                  absl::Span<const uint8_t> data);
 // Returns a path string suitable to create a temporary local directory.
 // Will return the same value every time it is called within one thread,
 // but different values for different threads and difference processes.
 std::string TemporaryLocalDirPath();

 // Creates an empty dir `path` and schedules it for deletion at exit.
 // Uses atexit(), and so the removal will not happen if exit() is bypassed.
 // This function is supposed to be called only on paths created by
 // TemporaryLocalDirPath().
 void CreateLocalDirRemovedAtExit(std::string_view path);

 // Requests that the process exits soon, with `exit_code`.
 // `exit_code` must be non-zero (!= EXIT_SUCCESS).
 // Async-signal-safe.
 void RequestEarlyExit(int exit_code);
 // Returns true iff RequestEarlyExit() was called.
 bool EarlyExitRequested();
 // Returns the value most recently passed to RequestEarlyExit()
 // or 0 if RequestEarlyExit() was not called.
 int ExitCode();

 // If `seed` != 0, returns `seed`, otherwise returns a random number
 // based on time, pid, tid, etc.
 size_t GetRandomSeed(size_t seed);

 // Returns a string that starts with `prefix` and that uniquely identifies
 // the caller's process and thread.
 std::string ProcessAndThreadUniqueID(std::string_view prefix);

 // Computes a random subset of `set` that needs to be
 // removed to reach `target_size` non-zero weights in the set.
 // `set` is an array of weights, some of which could be zero.
 //
 // Subsets with smaller combined weight are more likely to be returned.
 // The return value is a sorted vector of indices of elements of `set`
 // that need to be removed, including those with zero weights.
 // Randomness is retrieved from `rng`.
 //
 // Example:  set = {20, 10, 0, 40, 50}
 //   For target_size >= 4, the return value will be {2}, i.e. indices of all 0s.
 //   For target_size == 3, the return value will be one of
 //   {1, 2}, {0, 2}, {2, 3}, {2, 4}, i.e. the indices of 0s and one more index.
 //   {1, 2} is more likely than {2, 4} because set[1] < set[4].
 //
 // This is a flavour of https://en.wikipedia.org/wiki/Reservoir_sampling
 // with two differences:
 //   * Elements with weight 0 are unconditionally included.
 //   * We choose which elements to remove instead of which elements to pick.
 //     We use this inverted algorithm because in a typical use case
 //     `target_size` is just slightly smaller than set.size().
 std::vector<size_t> RandomWeightedSubset(absl::Span<const uint64_t> set,
                                          size_t target_size, Rng &rng);

 // Removes all elements from `set` whose indices are found in `subset_indices`.
 // `subset_indices` is a sorted vector.
 template <typename T>
 void RemoveSubset(const std::vector<size_t> &subset_indices,
                   std::vector<T> &set) {
   size_t pos_to_write = 0;
   for (size_t i = 0, n = set.size(); i < n; i++) {
     // If subset_indices.size() is k, this loop's complexity is O(n*log(k)).
     // We can do it in O(n+k) with a bit more code, but this loop is not
     // expected to be hot. Besides, k would typically be small.
     if (!std::binary_search(subset_indices.begin(), subset_indices.end(), i))
       std::swap(set[pos_to_write++], set[i]);
   }
   set.resize(pos_to_write);
 }

 // Adds a prefix and a postfix to `data` such that the result can be
 // appended to another such packed data and then the operation can be reversed.
 // The purpose is to allow appending blobs of data to a (possibly remote) file
 // such that when reading this file we can separate the blobs.
 // TODO(kcc): [impl] is there a lightweight equivalent in the open-source world?
 //  tar sounds too heavy.
 // TODO(kcc): [impl] investigate https://github.com/google/riegeli.
 ByteArray PackBytesForAppendFile(const ByteArray &data);
 // Unpacks `packed_data` into `unpacked` and `hashes`.
 // `packed_data` is multiple data packed by PackBytesForAppendFile()
 // and merged together.
 // `unpacked` or `hashes` can be nullptr.
 void UnpackBytesFromAppendFile(const ByteArray &packed_data,
                                std::vector<ByteArray> *unpacked,
                                std::vector<std::string> *hashes = nullptr);
 // Append the bytes from 'hash' to 'ba'.
 void AppendHashToArray(ByteArray &ba, std::string_view hash);
 // Reverse to AppendHashToArray.
 std::string ExtractHashFromArray(ByteArray &ba);

 // Pack {features, Hash(data)} into a byte array.
 ByteArray PackFeaturesAndHash(const ByteArray &data,
                               const FeatureVec &features);

 // Parses `dictionary_text` representing an AFL/libFuzzer dictionary.
 // https://github.com/google/AFL/blob/master/dictionaries/README.dictionaries
 // https://llvm.org/docs/LibFuzzer.html#dictionaries
 // Fills in `dictionary_entries` with byte sequences from the dictionary.
 // Returns true iff parsing completes successfully.
 bool ParseAFLDictionary(std::string_view dictionary_text,
                         std::vector<ByteArray> &dictionary_entries);

 // Maps `size` bytes with `mmap(NO_RESERVE)` or equivalent, returns the result.
 // CHECK-fails on error.
 // The resulting memory is unreserved, and will be zero-initialized on first
 // access.
 uint8_t *MmapNoReserve(size_t size);

 // Unmaps memory returned by `MmapNoReserve`().
 // CHECK-fails on error.
 void Munmap(uint8_t *ptr, size_t size);

 // Fixed size array allocated/deallocated with MmapNoReserve/Munmap.
 // operator[] checks indices for out-of-bounds.
 template <size_t kSize>
 class MmapNoReserveArray {
  public:
   MmapNoReserveArray() : array_(MmapNoReserve(kSize)) {}
   ~MmapNoReserveArray() { Munmap(array_, kSize); }
   MmapNoReserveArray(const MmapNoReserveArray &) = delete;
   MmapNoReserveArray &operator=(const MmapNoReserveArray &) = delete;
   MmapNoReserveArray(MmapNoReserveArray &&) = delete;
   MmapNoReserveArray &operator=(MmapNoReserveArray &&) = delete;
   uint8_t operator[](size_t i) const {
     CHECK_LT(i, kSize);
     return array_[i];
   }
   uint8_t &operator[](size_t i) {
     CHECK_LT(i, kSize);
     return array_[i];
   }

  private:
   uint8_t *array_;
 };

 }  // namespace centipede

 #endif  // THIRD_PARTY_CENTIPEDE_UTIL_H_
	// Copyright 2022 The Centipede Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef THIRD_PARTY_CENTIPEDE_UTIL_H_
	#define THIRD_PARTY_CENTIPEDE_UTIL_H_

	#include <algorithm>
	#include <cstddef>
	#include <cstdint>
	#include <string>
	#include <string_view>
	#include <vector>

	#include "absl/types/span.h"
	#include "./centipede/defs.h"
	#include "./centipede/feature.h"
	#include "./centipede/logging.h"

	namespace centipede {

	// Returns a printable hash of a byte array. Currently sha1 is used.
	std::string Hash(absl::Span<const uint8_t> span);
	// Same as above, but for std::string_view.
	std::string Hash(std::string_view str);
	// Hashes are always this many bytes.
	inline constexpr size_t kHashLen = 40;
	// Returns the hash of the contents of the file `file_path`.
	std::string HashOfFileContents(std::string_view file_path);
	// Returns a printable string representing at most `max_len` bytes of `data`.
	std::string AsString(const ByteArray &data, size_t max_len = 16);
	// Reads from a local file `file_path` into `data`.
	// Crashes on any error.
	void ReadFromLocalFile(std::string_view file_path, ByteArray &data);
	// Same as above.
	void ReadFromLocalFile(std::string_view file_path, std::string &data);
	// Same as above but for FeatureVec.
	// Crashes if the number of read bytes is not 0 mod sizeof(feature_t).
	void ReadFromLocalFile(std::string_view file_path, FeatureVec &data);
	// Same as above but for vector<uint32_t>.
	void ReadFromLocalFile(std::string_view file_path, std::vector<uint32_t> &data);
	// Writes the contents of `data` to a local file `file_path`.
	// Crashes on any error.
	void WriteToLocalFile(std::string_view file_path,
	absl::Span<const uint8_t> data);
	// Same as above.
	void WriteToLocalFile(std::string_view file_path, std::string_view data);
	// Same as above but for FeatureVec.
	void WriteToLocalFile(std::string_view file_path, const FeatureVec &data);
	// Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`.
	void WriteToLocalHashedFileInDir(std::string_view dir_path,
	absl::Span<const uint8_t> data);
	// Returns a path string suitable to create a temporary local directory.
	// Will return the same value every time it is called within one thread,
	// but different values for different threads and difference processes.
	std::string TemporaryLocalDirPath();

	// Creates an empty dir `path` and schedules it for deletion at exit.
	// Uses atexit(), and so the removal will not happen if exit() is bypassed.
	// This function is supposed to be called only on paths created by
	// TemporaryLocalDirPath().
	void CreateLocalDirRemovedAtExit(std::string_view path);

	// Requests that the process exits soon, with `exit_code`.
	// `exit_code` must be non-zero (!= EXIT_SUCCESS).
	// Async-signal-safe.
	void RequestEarlyExit(int exit_code);
	// Returns true iff RequestEarlyExit() was called.
	bool EarlyExitRequested();
	// Returns the value most recently passed to RequestEarlyExit()
	// or 0 if RequestEarlyExit() was not called.
	int ExitCode();

	// If `seed` != 0, returns `seed`, otherwise returns a random number
	// based on time, pid, tid, etc.
	size_t GetRandomSeed(size_t seed);

	// Returns a string that starts with `prefix` and that uniquely identifies
	// the caller's process and thread.
	std::string ProcessAndThreadUniqueID(std::string_view prefix);

	// Computes a random subset of `set` that needs to be
	// removed to reach `target_size` non-zero weights in the set.
	// `set` is an array of weights, some of which could be zero.
	//
	// Subsets with smaller combined weight are more likely to be returned.
	// The return value is a sorted vector of indices of elements of `set`
	// that need to be removed, including those with zero weights.
	// Randomness is retrieved from `rng`.
	//
	// Example: set = {20, 10, 0, 40, 50}
	// For target_size >= 4, the return value will be {2}, i.e. indices of all 0s.
	// For target_size == 3, the return value will be one of
	// {1, 2}, {0, 2}, {2, 3}, {2, 4}, i.e. the indices of 0s and one more index.
	// {1, 2} is more likely than {2, 4} because set[1] < set[4].
	//
	// This is a flavour of https://en.wikipedia.org/wiki/Reservoir_sampling
	// with two differences:
	// * Elements with weight 0 are unconditionally included.
	// * We choose which elements to remove instead of which elements to pick.
	// We use this inverted algorithm because in a typical use case
	// `target_size` is just slightly smaller than set.size().
	std::vector<size_t> RandomWeightedSubset(absl::Span<const uint64_t> set,
	size_t target_size, Rng &rng);

	// Removes all elements from `set` whose indices are found in `subset_indices`.
	// `subset_indices` is a sorted vector.
	template <typename T>
	void RemoveSubset(const std::vector<size_t> &subset_indices,
	std::vector<T> &set) {
	size_t pos_to_write = 0;
	for (size_t i = 0, n = set.size(); i < n; i++) {
	// If subset_indices.size() is k, this loop's complexity is O(n*log(k)).
	// We can do it in O(n+k) with a bit more code, but this loop is not
	// expected to be hot. Besides, k would typically be small.
	if (!std::binary_search(subset_indices.begin(), subset_indices.end(), i))
	std::swap(set[pos_to_write++], set[i]);
	}
	set.resize(pos_to_write);
	}

	// Adds a prefix and a postfix to `data` such that the result can be
	// appended to another such packed data and then the operation can be reversed.
	// The purpose is to allow appending blobs of data to a (possibly remote) file
	// such that when reading this file we can separate the blobs.
	// TODO(kcc): [impl] is there a lightweight equivalent in the open-source world?
	// tar sounds too heavy.
	// TODO(kcc): [impl] investigate https://github.com/google/riegeli.
	ByteArray PackBytesForAppendFile(const ByteArray &data);
	// Unpacks `packed_data` into `unpacked` and `hashes`.
	// `packed_data` is multiple data packed by PackBytesForAppendFile()
	// and merged together.
	// `unpacked` or `hashes` can be nullptr.
	void UnpackBytesFromAppendFile(const ByteArray &packed_data,
	std::vector<ByteArray> *unpacked,
	std::vector<std::string> *hashes = nullptr);
	// Append the bytes from 'hash' to 'ba'.
	void AppendHashToArray(ByteArray &ba, std::string_view hash);
	// Reverse to AppendHashToArray.
	std::string ExtractHashFromArray(ByteArray &ba);

	// Pack {features, Hash(data)} into a byte array.
	ByteArray PackFeaturesAndHash(const ByteArray &data,
	const FeatureVec &features);

	// Parses `dictionary_text` representing an AFL/libFuzzer dictionary.
	// https://github.com/google/AFL/blob/master/dictionaries/README.dictionaries
	// https://llvm.org/docs/LibFuzzer.html#dictionaries
	// Fills in `dictionary_entries` with byte sequences from the dictionary.
	// Returns true iff parsing completes successfully.
	bool ParseAFLDictionary(std::string_view dictionary_text,
	std::vector<ByteArray> &dictionary_entries);

	// Maps `size` bytes with `mmap(NO_RESERVE)` or equivalent, returns the result.
	// CHECK-fails on error.
	// The resulting memory is unreserved, and will be zero-initialized on first
	// access.
	uint8_t *MmapNoReserve(size_t size);

	// Unmaps memory returned by `MmapNoReserve`().
	// CHECK-fails on error.
	void Munmap(uint8_t *ptr, size_t size);

	// Fixed size array allocated/deallocated with MmapNoReserve/Munmap.
	// operator[] checks indices for out-of-bounds.
	template <size_t kSize>
	class MmapNoReserveArray {
	public:
	MmapNoReserveArray() : array_(MmapNoReserve(kSize)) {}
	~MmapNoReserveArray() { Munmap(array_, kSize); }
	MmapNoReserveArray(const MmapNoReserveArray &) = delete;
	MmapNoReserveArray &operator=(const MmapNoReserveArray &) = delete;
	MmapNoReserveArray(MmapNoReserveArray &&) = delete;
	MmapNoReserveArray &operator=(MmapNoReserveArray &&) = delete;
	uint8_t operator[](size_t i) const {
	CHECK_LT(i, kSize);
	return array_[i];
	}
	uint8_t &operator[](size_t i) {
	CHECK_LT(i, kSize);
	return array_[i];
	}

	private:
	uint8_t *array_;
	};

	} // namespace centipede

	#endif // THIRD_PARTY_CENTIPEDE_UTIL_H_