| // Copyright 2022 The Centipede Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // This library defines the concepts "fuzzing feature" and "feature domain". |
| // It is used by Centipede, and it can be used by fuzz runners to |
| // define their features in a way most friendly to Centipede. |
| // Fuzz runners do not have to use this file nor to obey the rules defined here. |
| // But using this file and following its rules is the simplest way if you want |
| // Centipede to understand the details about the features generated by the |
| // runner. |
| // |
| // This library must not depend on anything other than libc so that fuzz targets |
| // using it doesn't gain redundant coverage. For the same reason this library |
| // uses raw __builtin_trap instead of CHECKs. |
| // We make an exception for <algorithm> for std::sort/std::unique, |
| // since <algorithm> is very lightweight. |
| // This library is also header-only, with all functions defined as inline. |
| |
| #ifndef THIRD_PARTY_CENTIPEDE_FEATURE_H_ |
| #define THIRD_PARTY_CENTIPEDE_FEATURE_H_ |
| |
| #include <stddef.h> |
| #include <string.h> |
| |
| // WARNING!!!: Be very careful with what STL headers or other dependencies you |
| // add here. This header needs to remain mostly bare-bones so that we can |
| // include it into runner. |
| // <vector> is an exception, because it's too clumsy w/o it, and it introduces |
| // minimal code footprint. |
| #include <cstdint> |
| #include <limits> |
| #include <memory> |
| #include <vector> |
| |
| #include "./centipede/concurrent_bitset.h" |
| #include "./centipede/int_utils.h" |
| |
| namespace centipede { |
| |
| // Feature is an integer that identifies some unique behaviour |
| // of the fuzz target exercised by a given input. |
| // We say, this input has this feature with regard to this fuzz target. |
| // One example of a feature: a certain control flow edge being executed. |
| using feature_t = uint64_t; |
| |
| // A vector of features. It is not expected to be ordered. |
| // It typically does not contain repetitions, but it's ok to have them. |
| using FeatureVec = std::vector<feature_t>; |
| |
| namespace feature_domains { |
| |
| // Feature domain is a subset of 64-bit integers dedicated to a certain |
| // kind of fuzzing features. |
| // All domains are of the same size (kDomainSize), This way, we can compute |
| // a domain for a given feature by dividing by kDomainSize. |
| class Domain { |
| public: |
| // kDomainSize is a large enough value to hold all PCs of our largest target. |
| // It is also large enough to avoid too many collisions in other domains. |
| // At the same time, it is small enough that all domains combined require |
| // not too many bits (e.g. 32 bits is a good practical limit). |
| // TODO(kcc): consider making feature_t a 32-bit type if we expect to not |
| // use more than 32 bits. |
| // NOTE: this value may change in future. |
| static constexpr size_t kDomainSize = 1ULL << 27; |
| |
| constexpr Domain(size_t domain_id) : domain_id_(domain_id) {} |
| |
| constexpr feature_t begin() const { return kDomainSize * domain_id_; } |
| constexpr feature_t end() const { return begin() + kDomainSize; } |
| bool Contains(feature_t feature) const { |
| return feature >= begin() && feature < end(); |
| } |
| constexpr size_t domain_id() const { return domain_id_; } |
| |
| // Converts any `number` into a feature in this domain. |
| feature_t ConvertToMe(size_t number) const { |
| return begin() + number % kDomainSize; |
| } |
| |
| // Returns the DomainId of the domain that the feature belongs to. |
| static size_t FeatureToDomainId(feature_t feature) { |
| return feature / kDomainSize; |
| } |
| |
| private: |
| const size_t domain_id_; |
| }; |
| |
| // Notes on Designing Features and Domains |
| // |
| // Abstractly, a "feature" signals that there was something interesting about |
| // the input that Centipede should keep investigating. After seeing a particular |
| // feature occur often enough, Centipede will become less interested. |
| // |
| // Generally, different types of features should be put in different domains. |
| // This is useful for two reasons. First, Centipede can display the feature |
| // count for each domain separately. Second, Centipede calculates features |
| // weights relative to the size of the domain. If two different types of |
| // features are squeezed into the same domain, an overabundance of one type of |
| // feature can cause the other type of feature to be undervalued. |
| // |
| // The number of features can fit inside a particular domain is finite (see |
| // kDomainSize). A feature outside that range will be mapped inside that range. |
| // If the space of all possible features is larger than kDomainSize, it is |
| // recommended that the feature value is hashed as it is calculated. Feature |
| // spaces typically have some sort of internal structure and mapping a |
| // structured feature space into kDomainSize via a modulus can create |
| // predictable aliasing. Hashing the feature value reduces the worst case effect |
| // of the feature aliasing. If hashing, it is also recommended that the domain |
| // is defined in such a way so that the number of features actually discovered |
| // in that domain stays below a fraction of kDomainSize, even if the number of |
| // possible features is huge. The more feature aliasing that occurs in practice, |
| // the less effective the domain. |
| |
| // Catch-all domain for unknown features. |
| inline constexpr Domain kUnknown = {__COUNTER__}; |
| static_assert(kUnknown.domain_id() == 0); // No one used __COUNTER__ before. |
| // Represents PCs, i.e. control flow edges. |
| // Use ConvertPCFeatureToPcIndex() to convert back to a PC index. |
| inline constexpr Domain kPCs = {__COUNTER__}; |
| static_assert(kPCs.domain_id() != kUnknown.domain_id()); // just in case. |
| // Features derived from edge counters. See Convert8bitCounterToNumber(). |
| inline constexpr Domain k8bitCounters = {__COUNTER__}; |
| // Features derived from data flow edges. |
| // A typical data flow edge is a pair of PCs: {store-PC, load-PC}. |
| // Another variant of a data flow edge is a pair of {global-address, load-PC}. |
| inline constexpr Domain kDataFlow = {__COUNTER__}; |
| // Features derived from instrumenting CMP instructions. TODO(kcc): remove. |
| inline constexpr Domain kCMP = {__COUNTER__}; |
| |
| // Features in the following domains are created for comparison instructions |
| // 'a CMP b'. One component of the feature is the context, i.e. where the |
| // comparison happened. Another component depends on {a,b}. |
| // |
| // a == b. |
| // The other domains (kCMPModDiff, kCMPHamming, kCMPDiffLog) are for a != b. |
| inline constexpr Domain kCMPEq = {__COUNTER__}; |
| // (a - b) if |a-b| < 32, see ABToCmpModDiff. |
| inline constexpr Domain kCMPModDiff = {__COUNTER__}; |
| // hamming_distance(a, b), ABToCmpHamming. |
| inline constexpr Domain kCMPHamming = {__COUNTER__}; |
| // log2(a > b ? a - b : b - a), see ABToCmpDiffLog. |
| inline constexpr Domain kCMPDiffLog = {__COUNTER__}; |
| |
| // Features derived from observing function call stacks. |
| constexpr Domain kCallStack = {__COUNTER__}; |
| // Features derived from computing (bounded) control flow paths. |
| inline constexpr Domain kBoundedPath = {__COUNTER__}; |
| // Features derived from (unordered) pairs of PCs. |
| inline constexpr Domain kPCPair = {__COUNTER__}; |
| // Features defined by a user via |
| // __attribute__((section("__centipede_extra_features"))). |
| // There is no hard guarantee how many user domains are available, feel free to |
| // add or remove domains as needed. |
| inline constexpr Domain kUserDomains[] = { |
| {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, |
| {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, |
| {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, |
| {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, {__COUNTER__}, |
| }; |
| |
| // A fake domain, not actually used, must be last. |
| inline constexpr Domain kLastDomain = {__COUNTER__}; |
| // For now, check that all domains (except maybe for kLastDomain) fit |
| // into 32 bits. |
| static_assert(kLastDomain.begin() <= (1ULL << 32)); |
| |
| // Special feature used to indicate an absence of features. Typically used where |
| // a feature array must not be empty, but doesn't have any other features. |
| inline constexpr feature_t kNoFeature = kUnknown.begin(); |
| |
| } // namespace feature_domains |
| |
| // Converts an 8-bit coverage counter, i.e. a pair of {`pc_index`, |
| // `counter_value` must not be zero. |
| // |
| // We convert the 8-bit counter value to a number from 0 to 7 |
| // by computing its binary log, i.e. 1=>0, 2=>1, 4=>2, 8=>3, ..., 128=>7. |
| // This is a heuristic, similar to that of AFL or libFuzzer |
| // that tries to encourage inputs with different number of repetitions |
| // of the same PC. |
| inline size_t Convert8bitCounterToNumber(size_t pc_index, |
| uint8_t counter_value) { |
| if (counter_value == 0) __builtin_trap(); // Wrong input. |
| // Compute a log2 of counter_value, i.e. a value between 0 and 7. |
| // __builtin_clz consumes a 32-bit integer. |
| uint32_t counter_log2 = |
| sizeof(uint32_t) * 8 - 1 - __builtin_clz(counter_value); |
| return pc_index * 8 + counter_log2; |
| } |
| |
| // Given the `feature` from the PC domain, returns the feature's |
| // pc_index. I.e. reverse of kPC.ConvertToMe(), assuming all PCs originally |
| // converted to features were less than Domain::kDomainSize. |
| inline size_t ConvertPCFeatureToPcIndex(feature_t feature) { |
| auto domain = feature_domains::kPCs; |
| if (!domain.Contains(feature)) __builtin_trap(); |
| return feature - domain.begin(); |
| } |
| |
| // Encodes {`pc1`, `pc2`} into a number. |
| // `pc1` and `pc2` are in range [0, `max_pc`) |
| inline size_t ConvertPcPairToNumber(uintptr_t pc1, uintptr_t pc2, |
| uintptr_t max_pc) { |
| return pc1 * max_pc + pc2; |
| } |
| |
| // Transforms {a,b}, a!=b, into a number in [0,64) using a-b. |
| inline uintptr_t ABToCmpModDiff(uintptr_t a, uintptr_t b) { |
| uintptr_t diff = a - b; |
| return diff <= 32 ? diff : -diff < 32 ? 32 + -diff : 0; |
| } |
| |
| // Transforms {a,b}, a!=b, into a number in [0,64) using hamming distance. |
| inline uintptr_t ABToCmpHamming(uintptr_t a, uintptr_t b) { |
| return __builtin_popcountll(a ^ b) - 1; |
| } |
| |
| // Transforms {a,b}, a!=b, into a number in [0,64) using log2(a-b). |
| inline uintptr_t ABToCmpDiffLog(uintptr_t a, uintptr_t b) { |
| return __builtin_clzll(a > b ? a - b : b - a); |
| } |
| |
| // A simple fixed-capacity array with push_back. |
| // Thread-compatible. |
| template <size_t kSize> |
| class FeatureArray { |
| public: |
| // Constructs an empty feature array. |
| FeatureArray() = default; |
| |
| // pushes `feature` back if there is enough space. |
| void push_back(feature_t feature) { |
| if (num_features_ < kSize) { |
| features_[num_features_++] = feature; |
| } |
| } |
| |
| // Makes the array empty. |
| void clear() { num_features_ = 0; } |
| |
| // Returns the array's raw data. |
| feature_t *data() { return &features_[0]; } |
| |
| // Returns the number of elements in the array. |
| size_t size() const { return num_features_; } |
| |
| private: |
| // NOTE: No initializer needed: object state is captured by `num_features_`. |
| feature_t features_[kSize]; |
| size_t num_features_ = 0; |
| }; |
| |
| } // namespace centipede |
| |
| #endif // THIRD_PARTY_CENTIPEDE_FEATURE_H_ |