blob: 0630af23513fbacc8adb1955cc7dfcf278797d4c [file] [log] [blame]
// Copyright 2022 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_CENTIPEDE_UTIL_H_
#define THIRD_PARTY_CENTIPEDE_UTIL_H_
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>
#include "absl/types/span.h"
#include "./centipede/defs.h"
#include "./centipede/feature.h"
#include "./centipede/logging.h"
namespace centipede {
// Returns a printable hash of a byte array. Currently sha1 is used.
std::string Hash(absl::Span<const uint8_t> span);
// Same as above, but for std::string_view.
std::string Hash(std::string_view str);
// Hashes are always this many bytes.
inline constexpr size_t kHashLen = 40;
// Returns the hash of the contents of the file `file_path`.
std::string HashOfFileContents(std::string_view file_path);
// Returns a printable string representing at most `max_len` bytes of `data`.
std::string AsString(const ByteArray &data, size_t max_len = 16);
// Reads from a local file `file_path` into `data`.
// Crashes on any error.
void ReadFromLocalFile(std::string_view file_path, ByteArray &data);
// Same as above.
void ReadFromLocalFile(std::string_view file_path, std::string &data);
// Same as above but for FeatureVec.
// Crashes if the number of read bytes is not 0 mod sizeof(feature_t).
void ReadFromLocalFile(std::string_view file_path, FeatureVec &data);
// Same as above but for vector<uint32_t>.
void ReadFromLocalFile(std::string_view file_path, std::vector<uint32_t> &data);
// Writes the contents of `data` to a local file `file_path`.
// Crashes on any error.
void WriteToLocalFile(std::string_view file_path,
absl::Span<const uint8_t> data);
// Same as above.
void WriteToLocalFile(std::string_view file_path, std::string_view data);
// Same as above but for FeatureVec.
void WriteToLocalFile(std::string_view file_path, const FeatureVec &data);
// Writes `data` to `dir_path`/Hash(`data`). Does nothing if `dir_path.empty()`.
void WriteToLocalHashedFileInDir(std::string_view dir_path,
absl::Span<const uint8_t> data);
// Returns a path string suitable to create a temporary local directory.
// Will return the same value every time it is called within one thread,
// but different values for different threads and difference processes.
std::string TemporaryLocalDirPath();
// Creates an empty dir `path` and schedules it for deletion at exit.
// Uses atexit(), and so the removal will not happen if exit() is bypassed.
// This function is supposed to be called only on paths created by
// TemporaryLocalDirPath().
void CreateLocalDirRemovedAtExit(std::string_view path);
// Requests that the process exits soon, with `exit_code`.
// `exit_code` must be non-zero (!= EXIT_SUCCESS).
// Async-signal-safe.
void RequestEarlyExit(int exit_code);
// Returns true iff RequestEarlyExit() was called.
bool EarlyExitRequested();
// Returns the value most recently passed to RequestEarlyExit()
// or 0 if RequestEarlyExit() was not called.
int ExitCode();
// If `seed` != 0, returns `seed`, otherwise returns a random number
// based on time, pid, tid, etc.
size_t GetRandomSeed(size_t seed);
// Returns a string that starts with `prefix` and that uniquely identifies
// the caller's process and thread.
std::string ProcessAndThreadUniqueID(std::string_view prefix);
// Computes a random subset of `set` that needs to be
// removed to reach `target_size` non-zero weights in the set.
// `set` is an array of weights, some of which could be zero.
//
// Subsets with smaller combined weight are more likely to be returned.
// The return value is a sorted vector of indices of elements of `set`
// that need to be removed, including those with zero weights.
// Randomness is retrieved from `rng`.
//
// Example: set = {20, 10, 0, 40, 50}
// For target_size >= 4, the return value will be {2}, i.e. indices of all 0s.
// For target_size == 3, the return value will be one of
// {1, 2}, {0, 2}, {2, 3}, {2, 4}, i.e. the indices of 0s and one more index.
// {1, 2} is more likely than {2, 4} because set[1] < set[4].
//
// This is a flavour of https://en.wikipedia.org/wiki/Reservoir_sampling
// with two differences:
// * Elements with weight 0 are unconditionally included.
// * We choose which elements to remove instead of which elements to pick.
// We use this inverted algorithm because in a typical use case
// `target_size` is just slightly smaller than set.size().
std::vector<size_t> RandomWeightedSubset(absl::Span<const uint64_t> set,
size_t target_size, Rng &rng);
// Removes all elements from `set` whose indices are found in `subset_indices`.
// `subset_indices` is a sorted vector.
template <typename T>
void RemoveSubset(const std::vector<size_t> &subset_indices,
std::vector<T> &set) {
size_t pos_to_write = 0;
for (size_t i = 0, n = set.size(); i < n; i++) {
// If subset_indices.size() is k, this loop's complexity is O(n*log(k)).
// We can do it in O(n+k) with a bit more code, but this loop is not
// expected to be hot. Besides, k would typically be small.
if (!std::binary_search(subset_indices.begin(), subset_indices.end(), i))
std::swap(set[pos_to_write++], set[i]);
}
set.resize(pos_to_write);
}
// Adds a prefix and a postfix to `data` such that the result can be
// appended to another such packed data and then the operation can be reversed.
// The purpose is to allow appending blobs of data to a (possibly remote) file
// such that when reading this file we can separate the blobs.
// TODO(kcc): [impl] is there a lightweight equivalent in the open-source world?
// tar sounds too heavy.
// TODO(kcc): [impl] investigate https://github.com/google/riegeli.
ByteArray PackBytesForAppendFile(const ByteArray &data);
// Unpacks `packed_data` into `unpacked` and `hashes`.
// `packed_data` is multiple data packed by PackBytesForAppendFile()
// and merged together.
// `unpacked` or `hashes` can be nullptr.
void UnpackBytesFromAppendFile(const ByteArray &packed_data,
std::vector<ByteArray> *unpacked,
std::vector<std::string> *hashes = nullptr);
// Append the bytes from 'hash' to 'ba'.
void AppendHashToArray(ByteArray &ba, std::string_view hash);
// Reverse to AppendHashToArray.
std::string ExtractHashFromArray(ByteArray &ba);
// Pack {features, Hash(data)} into a byte array.
ByteArray PackFeaturesAndHash(const ByteArray &data,
const FeatureVec &features);
// Parses `dictionary_text` representing an AFL/libFuzzer dictionary.
// https://github.com/google/AFL/blob/master/dictionaries/README.dictionaries
// https://llvm.org/docs/LibFuzzer.html#dictionaries
// Fills in `dictionary_entries` with byte sequences from the dictionary.
// Returns true iff parsing completes successfully.
bool ParseAFLDictionary(std::string_view dictionary_text,
std::vector<ByteArray> &dictionary_entries);
// Maps `size` bytes with `mmap(NO_RESERVE)` or equivalent, returns the result.
// CHECK-fails on error.
// The resulting memory is unreserved, and will be zero-initialized on first
// access.
uint8_t *MmapNoReserve(size_t size);
// Unmaps memory returned by `MmapNoReserve`().
// CHECK-fails on error.
void Munmap(uint8_t *ptr, size_t size);
// Fixed size array allocated/deallocated with MmapNoReserve/Munmap.
// operator[] checks indices for out-of-bounds.
template <size_t kSize>
class MmapNoReserveArray {
public:
MmapNoReserveArray() : array_(MmapNoReserve(kSize)) {}
~MmapNoReserveArray() { Munmap(array_, kSize); }
MmapNoReserveArray(const MmapNoReserveArray &) = delete;
MmapNoReserveArray &operator=(const MmapNoReserveArray &) = delete;
MmapNoReserveArray(MmapNoReserveArray &&) = delete;
MmapNoReserveArray &operator=(MmapNoReserveArray &&) = delete;
uint8_t operator[](size_t i) const {
CHECK_LT(i, kSize);
return array_[i];
}
uint8_t &operator[](size_t i) {
CHECK_LT(i, kSize);
return array_[i];
}
private:
uint8_t *array_;
};
} // namespace centipede
#endif // THIRD_PARTY_CENTIPEDE_UTIL_H_