// Copyright 2022 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Centipede: an experimental distributed fuzzing engine.
// Very simple / naive so far.
// Main use case: large out-of-process fuzz targets with relatively slow
// execution (< 100 exec/s).
//
// Basic approach (subject to change):
// * All state is stored in a local or remote directory `workdir`.
// * State consists of a corpus (inputs) and feature sets (see feature_t).
// * Feature sets are associated with a binary, so that two binaries
// have independent feature sets stored in different subdirs in `workdir`,
// like binaryA-sha1-of-A and binaryB-sha1-of-B.
// If the binary is recompiled at a different revision or with different
// compiler options, it is considered a different binary, and its feature
// sets will need to be recomputed in its own separate dir.
// * The corpus is not tied to the binary. It is stored directly in
// `workdir` (see the illustrative layout sketch below).
// * The fuzzer runs in `total_shards` independent processes.
// * Each shard appends data to its own files in `workdir`: corpus and features;
// no other process writes to those files.
// * Each shard may periodically read some other shard's corpus and features.
// Since all files are append-only (no renames, no deletions) we may only
// have partial reads, and the algorithm is expected to tolerate those.
// * Fuzzing can be run locally in multiple processes, with a local `workdir`
// or on a cluster, which supports `workdir` on a remote file system.
// * The intent is to scale to an arbitrary number of shards,
// currently tested with total_shards = 10000.
//
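// For illustration, a `workdir` with total_shards = 2 and one binary might
// look roughly like this (a sketch only: the exact file and subdir names are
// defined in workdir.h; `my_target` is a placeholder):
//   workdir/corpus.000000        - corpus shard 0, append-only
//   workdir/corpus.000001        - corpus shard 1, append-only
//   workdir/my_target-<hash>/features.000000  - features of shard 0
//   workdir/my_target-<hash>/features.000001  - features of shard 1
//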
// Differential fuzzing is not yet properly implemented.
// Currently, one can run target A in a given workdir, then target B, and so
// on, and the corpus will grow over time benefiting from all targets.

#include "./centipede/centipede.h"

#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <filesystem> // NOLINT
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/const_init.h" // NOLINT
#include "absl/base/nullability.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "./centipede/binary_info.h"
#include "./centipede/centipede_callbacks.h"
#include "./centipede/command.h"
#include "./centipede/control_flow.h"
#include "./centipede/corpus_io.h"
#include "./centipede/coverage.h"
#include "./centipede/environment.h"
#include "./centipede/feature.h"
#include "./centipede/feature_set.h"
#include "./centipede/mutation_input.h"
#include "./centipede/runner_result.h"
#include "./centipede/rusage_profiler.h"
#include "./centipede/rusage_stats.h"
#include "./centipede/stats.h"
#include "./centipede/stop.h"
#include "./centipede/util.h"
#include "./centipede/workdir.h"
#include "./common/blob_file.h"
#include "./common/defs.h"
#include "./common/hash.h"
#include "./common/logging.h"
#include "./common/remote_file.h"
#include "./common/status_macros.h"
namespace fuzztest::internal {
Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks,
const BinaryInfo &binary_info,
CoverageLogger &coverage_logger, std::atomic<Stats> &stats)
: env_(env),
user_callbacks_(user_callbacks),
rng_(env_.seed),
// TODO(kcc): [impl] find a better way to compute frequency_threshold.
fs_(env_.feature_frequency_threshold, env_.MakeDomainDiscardMask()),
coverage_frontier_(binary_info),
binary_info_(binary_info),
pc_table_(binary_info_.pc_table),
symbols_(binary_info_.symbols),
function_filter_(env_.function_filter, symbols_),
coverage_logger_(coverage_logger),
stats_(stats),
input_filter_path_(std::filesystem::path(TemporaryLocalDirPath())
.append("filter-input")),
input_filter_cmd_{[&] {
Command::Options cmd_options;
cmd_options.args = {input_filter_path_};
cmd_options.stdout_file = "/dev/null";
cmd_options.stderr_file = "/dev/null";
return Command{env_.input_filter, std::move(cmd_options)};
}()},
rusage_profiler_(
/*scope=*/RUsageScope::ThisProcess(),
/*metrics=*/env.DumpRUsageTelemetryInThisShard()
? RUsageProfiler::kAllMetrics
: RUsageProfiler::kMetricsOff,
/*raii_actions=*/RUsageProfiler::kRaiiOff,
/*location=*/{__FILE__, __LINE__},
/*description=*/"Engine") {
FUZZTEST_CHECK(env_.seed) << "env_.seed must not be zero";
if (!env_.input_filter.empty() && env_.fork_server)
input_filter_cmd_.StartForkServer(TemporaryLocalDirPath(), "input_filter");
}

void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) {
std::vector<std::string> sharded_corpus_files;
FUZZTEST_CHECK_OK(RemoteGlobMatch(
WorkDir{env}.CorpusFilePaths().AllShardsGlob(), sharded_corpus_files));
ExportCorpus(sharded_corpus_files, dir);
}

void Centipede::CorpusFromFiles(const Environment &env, std::string_view dir) {
// Shard the file paths in the source `dir` based on hashes of filenames.
// Such a partition is stable: a given file always goes to the same shard.
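// Illustrative example: with env.total_shards = 10, a path whose std::hash
// value is 1234567 always lands in shard 1234567 % 10 == 7 (stable for a
// given standard library implementation, though not across different ones).
// Together with the per-shard hash check below, this makes re-importing the
// same `dir` append only inputs a shard does not already contain.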
std::vector<std::vector<std::string>> sharded_paths(env.total_shards);
std::vector<std::string> paths;
size_t total_paths = 0;
const std::vector<std::string> listed_paths =
ValueOrDie(RemoteListFiles(dir, /*recursively=*/true));
for (const std::string &path : listed_paths) {
size_t filename_hash = std::hash<std::string>{}(path);
sharded_paths[filename_hash % env.total_shards].push_back(path);
++total_paths;
}
// If the destination `workdir` is specified (note that empty means "use the
// current directory"), we might need to create it.
if (!env.workdir.empty()) {
FUZZTEST_CHECK_OK(RemoteMkdir(env.workdir));
}
// Iterate over all shards, adding inputs to the current shard.
size_t inputs_added = 0;
size_t inputs_ignored = 0;
const auto corpus_file_paths = WorkDir{env}.CorpusFilePaths();
for (size_t shard = 0; shard < env.total_shards; shard++) {
const std::string corpus_file_path = corpus_file_paths.Shard(shard);
size_t num_shard_bytes = 0;
// Read the shard (if it exists), collect input hashes from it.
absl::flat_hash_set<std::string> existing_hashes;
if (RemotePathExists(corpus_file_path)) {
auto reader = DefaultBlobFileReaderFactory();
// May fail to open if file doesn't exist.
reader->Open(corpus_file_path).IgnoreError();
ByteSpan blob;
while (reader->Read(blob).ok()) {
existing_hashes.insert(Hash(blob));
}
}
// Add inputs to the current shard, if the shard doesn't have them already.
auto appender = DefaultBlobFileWriterFactory(env.riegeli);
FUZZTEST_CHECK_OK(appender->Open(corpus_file_path, "a"))
<< "Failed to open corpus file: " << corpus_file_path;
ByteArray shard_data;
for (const auto &path : sharded_paths[shard]) {
std::string input;
FUZZTEST_CHECK_OK(RemoteFileGetContents(path, input));
if (input.empty() || existing_hashes.contains(Hash(input))) {
++inputs_ignored;
continue;
}
FUZZTEST_CHECK_OK(appender->Write(ByteArray{input.begin(), input.end()}));
++inputs_added;
}
FUZZTEST_LOG(INFO) << VV(shard) << VV(inputs_added) << VV(inputs_ignored)
<< VV(num_shard_bytes) << VV(shard_data.size());
}
FUZZTEST_CHECK_EQ(total_paths, inputs_added + inputs_ignored);
}

absl::Status Centipede::CrashesToFiles(const Environment &env,
std::string_view dir) {
std::vector<std::string> reproducer_dirs;
const auto wd = WorkDir{env};
auto reproducer_match_status = RemoteGlobMatch(
wd.CrashReproducerDirPaths().AllShardsGlob(), reproducer_dirs);
if (!reproducer_match_status.ok() &&
!absl::IsNotFound(reproducer_match_status)) {
return reproducer_match_status;
}
absl::flat_hash_set<std::string> crash_ids;
for (const auto &reproducer_dir : reproducer_dirs) {
ASSIGN_OR_RETURN_IF_NOT_OK(
std::vector<std::string> reproducer_paths,
RemoteListFiles(reproducer_dir, /*recursively=*/false));
for (const auto &reproducer_path : reproducer_paths) {
std::string id = std::filesystem::path{reproducer_path}.filename();
if (auto [_it, inserted] = crash_ids.insert(id); !inserted) {
continue;
}
RETURN_IF_NOT_OK(RemoteFileCopy(
reproducer_path,
(std::filesystem::path{dir} / absl::StrCat(id, ".data")).string()));
const auto shard_index = wd.CrashReproducerDirPaths().GetShardIndex(
std::filesystem::path{reproducer_path}.parent_path().string());
FUZZTEST_CHECK(shard_index.has_value());
const auto metadata_dir = wd.CrashMetadataDirPaths().Shard(*shard_index);
const auto description_filename = absl::StrCat(id, ".desc");
const auto signature_filename = absl::StrCat(id, ".sig");
RETURN_IF_NOT_OK(RemoteFileCopy(
(std::filesystem::path{metadata_dir} / description_filename).string(),
(std::filesystem::path{dir} / description_filename).string()));
RETURN_IF_NOT_OK(RemoteFileCopy(
(std::filesystem::path{metadata_dir} / signature_filename).string(),
(std::filesystem::path{dir} / signature_filename).string()));
}
}
return absl::OkStatus();
}
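
// Stores a fresh `Stats` snapshot for this shard and, if `env_.log_level` is
// at least `min_log_level`, logs a one-line summary tagged with `log_type`
// (e.g. "begin-fuzz", "new-feature", "pulse", "end-fuzz").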
void Centipede::UpdateAndMaybeLogStats(std::string_view log_type,
size_t min_log_level) {
// `fuzz_start_time_ == absl::InfiniteFuture()` means that fuzzing hasn't
// started yet. If so, report zeros for the fuzzing time and speed.
const double fuzz_time_secs =
fuzz_start_time_ == absl::InfiniteFuture()
? 0
: absl::ToDoubleSeconds(absl::Now() - fuzz_start_time_);
const double execs_per_sec =
fuzz_time_secs == 0 ? 0 : (1.0 * num_runs_ / fuzz_time_secs);
const auto [max_corpus_size, avg_corpus_size] = corpus_.MaxAndAvgSize();
// NOTE: For now, this will double-count rusage in every shard on the same
// machine. The stats reporter knows and deals with that.
static const auto rusage_scope = RUsageScope::ThisProcess();
const auto rusage_timing = RUsageTiming::Snapshot(rusage_scope);
const auto rusage_memory = RUsageMemory::Snapshot(rusage_scope);
namespace fd = feature_domains;
stats_.store(Stats{
StatsMeta{
/*timestamp_unix_micros=*/
static_cast<uint64_t>(absl::ToUnixMicros(absl::Now())),
},
ExecStats{
/*fuzz_time_sec=*/static_cast<uint64_t>(std::ceil(fuzz_time_secs)),
/*num_executions=*/num_runs_,
/*num_target_crashes=*/static_cast<uint64_t>(num_crashes_),
},
CovStats{
/*num_covered_pcs=*/fs_.CountFeatures(fd::kPCs),
/*num_8bit_counter_features=*/fs_.CountFeatures(fd::k8bitCounters),
/*num_data_flow_features=*/fs_.CountFeatures(fd::kDataFlow),
/*num_cmp_features=*/fs_.CountFeatures(fd::kCMPDomains),
/*num_call_stack_features=*/fs_.CountFeatures(fd::kCallStack),
/*num_bounded_path_features=*/fs_.CountFeatures(fd::kBoundedPath),
/*num_pc_pair_features=*/fs_.CountFeatures(fd::kPCPair),
/*num_user_features=*/fs_.CountFeatures(fd::kUserDomains),
/*num_user0_features=*/fs_.CountFeatures(fd::kUserDomains[0]),
/*num_user1_features=*/fs_.CountFeatures(fd::kUserDomains[1]),
/*num_user2_features=*/fs_.CountFeatures(fd::kUserDomains[2]),
/*num_user3_features=*/fs_.CountFeatures(fd::kUserDomains[3]),
/*num_user4_features=*/fs_.CountFeatures(fd::kUserDomains[4]),
/*num_user5_features=*/fs_.CountFeatures(fd::kUserDomains[5]),
/*num_user6_features=*/fs_.CountFeatures(fd::kUserDomains[6]),
/*num_user7_features=*/fs_.CountFeatures(fd::kUserDomains[7]),
/*num_user8_features=*/fs_.CountFeatures(fd::kUserDomains[8]),
/*num_user9_features=*/fs_.CountFeatures(fd::kUserDomains[9]),
/*num_user10_features=*/fs_.CountFeatures(fd::kUserDomains[10]),
/*num_user11_features=*/fs_.CountFeatures(fd::kUserDomains[11]),
/*num_user12_features=*/fs_.CountFeatures(fd::kUserDomains[12]),
/*num_user13_features=*/fs_.CountFeatures(fd::kUserDomains[13]),
/*num_user14_features=*/fs_.CountFeatures(fd::kUserDomains[14]),
/*num_user15_features=*/fs_.CountFeatures(fd::kUserDomains[15]),
/*num_unknown_features=*/fs_.CountFeatures(fd::kUnknown),
/*num_funcs_in_frontier=*/coverage_frontier_.NumFunctionsInFrontier(),
},
CorpusStats{
/*active_corpus_size=*/corpus_.NumActive(),
/*total_corpus_size=*/corpus_.NumTotal(),
/*max_corpus_element_size=*/max_corpus_size,
/*avg_corpus_element_size=*/avg_corpus_size,
},
RusageStats{
/*engine_rusage_avg_millicores=*/static_cast<uint64_t>(
std::lround(rusage_timing.cpu_hyper_cores * 1000)),
/*engine_rusage_cpu_percent=*/
static_cast<uint64_t>(
std::lround(rusage_timing.cpu_utilization * 100)),
/*engine_rusage_rss_mb=*/
static_cast<uint64_t>(rusage_memory.mem_rss >> 20),
/*engine_rusage_vsize_mb=*/
static_cast<uint64_t>(rusage_memory.mem_vsize >> 20),
},
});
if (env_.log_level < min_log_level) return;
std::ostringstream os;
auto LogIfNotZero = [&os](size_t value, std::string_view name) {
if (!value) return;
os << " " << name << ": " << value;
};
if (!env_.experiment_name.empty()) os << env_.experiment_name << " ";
os << "[S" << env_.my_shard_index << "." << num_runs_ << "] " << log_type
<< ": ";
os << fs_;
os << " corp: " << corpus_.NumActive() << "/" << corpus_.NumTotal();
LogIfNotZero(coverage_frontier_.NumFunctionsInFrontier(), "fr");
LogIfNotZero(num_crashes_, "crash");
os << " max/avg: " << max_corpus_size << "/" << avg_corpus_size << " "
<< corpus_.MemoryUsageString();
os << " exec/s: "
<< (execs_per_sec < 1.0 ? execs_per_sec : std::round(execs_per_sec));
os << " mb: " << (rusage_memory.mem_rss >> 20);
FUZZTEST_LOG(INFO) << os.str();
}

void Centipede::LogFeaturesAsSymbols(const FeatureVec &fv) {
if (!env_.LogFeaturesInThisShard()) return;
for (auto feature : fv) {
if (!feature_domains::kPCs.Contains(feature)) continue;
PCIndex pc_index = ConvertPCFeatureToPcIndex(feature);
auto description = coverage_logger_.ObserveAndDescribeIfNew(pc_index);
if (description.empty()) continue;
FUZZTEST_LOG(INFO) << description;
}
}
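
// Returns true iff `input` passes the user-provided --input_filter, or if no
// filter is configured. The input is written to a temporary file and the
// filter command's exit status decides the outcome.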
bool Centipede::InputPassesFilter(const ByteArray &input) {
if (env_.input_filter.empty()) return true;
WriteToLocalFile(input_filter_path_, input);
bool result = input_filter_cmd_.Execute() == EXIT_SUCCESS;
std::filesystem::remove(input_filter_path_);
return result;
}
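
// Runs `input_vec` on `binary` via the user callbacks and reports a crash on
// failure. Returns true if execution succeeded or the failure should be
// ignored (BatchResult::IsIgnoredFailure()).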
bool Centipede::ExecuteAndReportCrash(std::string_view binary,
const std::vector<ByteArray> &input_vec,
BatchResult &batch_result) {
bool success = user_callbacks_.Execute(binary, input_vec, batch_result);
if (!success) ReportCrash(binary, input_vec, batch_result);
return success || batch_result.IsIgnoredFailure();
}

// *** Highly experimental and risky. May not scale well for large targets. ***
//
// The idea: an unordered pair of two features {a, b} is by itself a feature.
// In the worst case, the number of such synthetic features is a square of
// the number of regular features, which may not scale.
// For now, we only treat pairs of PCs as features, which is still quadratic
// by the number of PCs. But in moderate-sized programs this may be tolerable.
//
// Rationale: if two different parts of the target are exercised simultaneously,
// this may create interesting behaviour that is hard to capture with regular
// control flow (or other) features.
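//
// For example, an input covering k distinct PCs yields up to k*(k-1)/2
// unordered PC pairs (the i < j loop below): 100 PCs can contribute up to
// 4950 pair features. Only pairs not yet seen in `fs_` are actually added.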
size_t Centipede::AddPcPairFeatures(FeatureVec &fv) {
// Using a scratch vector to avoid allocations.
auto &pcs = add_pc_pair_scratch_;
pcs.clear();
size_t num_pcs = pc_table_.size();
size_t num_added_pairs = 0;
// Collect PCs from fv.
for (auto feature : fv) {
if (feature_domains::kPCs.Contains(feature))
pcs.push_back(ConvertPCFeatureToPcIndex(feature));
}
// The quadratic loop: iterate all PC pairs (!!).
for (size_t i = 0, n = pcs.size(); i < n; ++i) {
size_t pc1 = pcs[i];
for (size_t j = i + 1; j < n; ++j) {
size_t pc2 = pcs[j];
feature_t f = feature_domains::kPCPair.ConvertToMe(
ConvertPcPairToNumber(pc1, pc2, num_pcs));
// If we have seen this pair at least once, ignore it.
if (fs_.Frequency(f) != 0) continue;
fv.push_back(f);
++num_added_pairs;
}
}
return num_added_pairs;
}
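
// Executes `input_vec` on the main and extra binaries and merges any new
// features. Inputs that gain coverage are added to the corpus and written to
// `corpus_file`/`features_file` (if non-null); `unconditional_features_file`
// (if non-null) receives features for every input. Returns true if the batch
// gained new coverage.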
bool Centipede::RunBatch(
const std::vector<ByteArray> &input_vec,
BlobFileWriter *absl_nullable corpus_file,
BlobFileWriter *absl_nullable features_file,
BlobFileWriter *absl_nullable unconditional_features_file) {
BatchResult batch_result;
bool success = ExecuteAndReportCrash(env_.binary, input_vec, batch_result);
FUZZTEST_CHECK_EQ(input_vec.size(), batch_result.results().size());
for (const auto &extra_binary : env_.extra_binaries) {
if (ShouldStop()) break;
BatchResult extra_batch_result;
success =
ExecuteAndReportCrash(extra_binary, input_vec, extra_batch_result) &&
success;
}
if (EarlyStopRequested()) return false;
if (!success && env_.exit_on_crash) {
FUZZTEST_LOG(INFO) << "--exit_on_crash is enabled; exiting soon";
RequestEarlyStop(EXIT_FAILURE);
return false;
}
FUZZTEST_CHECK_EQ(batch_result.results().size(), input_vec.size());
num_runs_ += input_vec.size();
bool batch_gained_new_coverage = false;
for (size_t i = 0; i < input_vec.size(); i++) {
if (ShouldStop()) break;
FeatureVec &fv = batch_result.results()[i].mutable_features();
bool function_filter_passed = function_filter_.filter(fv);
bool input_gained_new_coverage = fs_.PruneFeaturesAndCountUnseen(fv) != 0;
if (env_.use_pcpair_features && AddPcPairFeatures(fv) != 0)
input_gained_new_coverage = true;
if (unconditional_features_file != nullptr) {
FUZZTEST_CHECK_OK(unconditional_features_file->Write(
PackFeaturesAndHash(input_vec[i], fv)));
}
if (input_gained_new_coverage) {
// TODO(kcc): [impl] add stats for filtered-out inputs.
if (!InputPassesFilter(input_vec[i])) continue;
fs_.MergeFeatures(fv);
LogFeaturesAsSymbols(fv);
batch_gained_new_coverage = true;
FUZZTEST_CHECK_GT(fv.size(), 0UL);
if (function_filter_passed) {
corpus_.Add(input_vec[i], fv, batch_result.results()[i].metadata(), fs_,
coverage_frontier_);
}
if (corpus_file != nullptr) {
FUZZTEST_CHECK_OK(corpus_file->Write(input_vec[i]));
}
if (!env_.corpus_dir.empty() && !env_.corpus_dir[0].empty()) {
WriteToLocalHashedFileInDir(env_.corpus_dir[0], input_vec[i]);
}
if (features_file != nullptr) {
FUZZTEST_CHECK_OK(
features_file->Write(PackFeaturesAndHash(input_vec[i], fv)));
}
}
}
return batch_gained_new_coverage;
}

// TODO(kcc): [impl] don't reread the same corpus twice.
void Centipede::LoadShard(const Environment &load_env, size_t shard_index,
bool rerun) {
FUZZTEST_VLOG(1) << "Loading shard " << shard_index
<< (rerun ? " with rerunning" : " without rerunning");
size_t num_added_inputs = 0;
size_t num_skipped_inputs = 0;
std::vector<ByteArray> inputs_to_rerun;
auto input_features_callback = [&](ByteArray input,
FeatureVec input_features) {
if (ShouldStop()) return;
if (input_features.empty()) {
if (rerun) {
inputs_to_rerun.emplace_back(std::move(input));
}
} else {
LogFeaturesAsSymbols(input_features);
const auto num_new_features =
fs_.PruneFeaturesAndCountUnseen(input_features);
if (num_new_features != 0) {
FUZZTEST_VLOG(10) << "Adding input " << Hash(input)
<< "; new features: " << num_new_features;
fs_.MergeFeatures(input_features);
// TODO(kcc): cmp_args are currently not saved to disk and not reloaded.
corpus_.Add(input, input_features, {}, fs_, coverage_frontier_);
++num_added_inputs;
} else {
FUZZTEST_VLOG(10) << "Skipping input: " << Hash(input);
++num_skipped_inputs;
}
}
};
// See serialize_shard_loads on why we may want to serialize shard loads.
// TODO(kcc): remove serialize_shard_loads when LoadShards() uses less RAM.
const WorkDir wd{load_env};
const std::string corpus_path = wd.CorpusFilePaths().Shard(shard_index);
const std::string features_path = wd.FeaturesFilePaths().Shard(shard_index);
if (env_.serialize_shard_loads) {
ABSL_CONST_INIT static absl::Mutex load_shard_mu{absl::kConstInit};
absl::MutexLock lock(&load_shard_mu);
ReadShard(corpus_path, features_path, input_features_callback);
} else {
ReadShard(corpus_path, features_path, input_features_callback);
}
FUZZTEST_VLOG(1) << "Loaded shard " << shard_index << ": added "
<< num_added_inputs << " / skipped " << num_skipped_inputs
<< " inputs";
if (num_added_inputs > 0) UpdateAndMaybeLogStats("load-shard", 1);
if (!inputs_to_rerun.empty()) Rerun(inputs_to_rerun);
}

void Centipede::LoadAllShardsInRandomOrder(const Environment &load_env,
bool rerun_my_shard) {
// TODO(ussuri): It seems logical to reset `corpus_` before this, but
// that broke `ShardsAndDistillTest` in testing/centipede_test.cc.
// Investigate.
std::vector<size_t> shard_idxs(env_.total_shards);
std::iota(shard_idxs.begin(), shard_idxs.end(), 0);
std::shuffle(shard_idxs.begin(), shard_idxs.end(), rng_);
size_t num_shards_loaded = 0;
for (size_t shard_idx : shard_idxs) {
const bool rerun = rerun_my_shard && shard_idx == env_.my_shard_index;
LoadShard(load_env, shard_idx, rerun);
FUZZTEST_LOG_IF(INFO, (++num_shards_loaded % 100) == 0)
<< VV(num_shards_loaded);
}
}
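
// Re-executes `to_rerun` (inputs whose features are unknown) in batches of at
// most env_.batch_size, appending the computed features to this shard's
// features file.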
void Centipede::Rerun(std::vector<ByteArray> &to_rerun) {
if (to_rerun.empty()) return;
auto features_file_path = wd_.FeaturesFilePaths().Shard(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory(env_.riegeli);
FUZZTEST_CHECK_OK(features_file->Open(features_file_path, "a"));
FUZZTEST_LOG(INFO) << to_rerun.size() << " inputs to rerun";
// Re-run all inputs for which we don't know their features.
// Run in batches of at most env_.batch_size inputs each.
while (!to_rerun.empty()) {
if (ShouldStop()) break;
size_t batch_size = std::min(to_rerun.size(), env_.batch_size);
std::vector<ByteArray> batch(to_rerun.end() - batch_size, to_rerun.end());
to_rerun.resize(to_rerun.size() - batch_size);
if (RunBatch(batch, nullptr, nullptr, features_file.get())) {
UpdateAndMaybeLogStats("rerun-old", 1);
}
}
}

void Centipede::GenerateCoverageReport(std::string_view filename_annotation,
std::string_view description) {
if (pc_table_.empty()) return;
auto coverage_path = wd_.CoverageReportPath(filename_annotation);
FUZZTEST_LOG(INFO) << "Generate coverage report [" << description << "]; "
<< VV(coverage_path);
auto pci_vec = fs_.ToCoveragePCs();
Coverage coverage(pc_table_, pci_vec);
coverage.DumpReportToFile(symbols_, coverage_path, description);
}

void Centipede::GenerateCorpusStats(std::string_view filename_annotation,
std::string_view description) {
auto stats_path = wd_.CorpusStatsPath(filename_annotation);
FUZZTEST_LOG(INFO) << "Generate corpus stats [" << description << "]; "
<< VV(stats_path);
corpus_.DumpStatsToFile(fs_, stats_path, description);
}

// TODO(nedwill): add integration test once tests are refactored per b/255660879
void Centipede::GenerateSourceBasedCoverageReport(
std::string_view filename_annotation, std::string_view description) {
if (env_.clang_coverage_binary.empty()) return;
auto report_path = wd_.SourceBasedCoverageReportPath(filename_annotation);
FUZZTEST_LOG(INFO) << "Generate source based coverage report [" << description
<< "]; " << VV(report_path);
FUZZTEST_CHECK_OK(RemoteMkdir(report_path));
std::vector<std::string> raw_profiles = wd_.EnumerateRawCoverageProfiles();
if (raw_profiles.empty()) {
FUZZTEST_LOG(ERROR) << "No raw profiles found for coverage report";
return;
}
std::string indexed_profile_path =
wd_.SourceBasedCoverageIndexedProfilePath();
std::vector<std::string> merge_arguments = {"merge", "-o",
indexed_profile_path, "-sparse"};
for (const std::string &raw_profile : raw_profiles) {
merge_arguments.push_back(raw_profile);
}
Command::Options merge_cmd_options;
merge_cmd_options.args = std::move(merge_arguments);
Command merge_command{"llvm-profdata", std::move(merge_cmd_options)};
if (merge_command.Execute() != EXIT_SUCCESS) {
FUZZTEST_LOG(ERROR) << "Failed to run command " << merge_command.ToString();
return;
}
Command::Options generate_report_cmd_options;
generate_report_cmd_options.args = {
"show", "-format=html", absl::StrCat("-output-dir=", report_path),
absl::StrCat("-instr-profile=", indexed_profile_path),
env_.clang_coverage_binary};
Command generate_report_command{"llvm-cov",
std::move(generate_report_cmd_options)};
if (generate_report_command.Execute() != EXIT_SUCCESS) {
FUZZTEST_LOG(ERROR) << "Failed to run command "
<< generate_report_command.ToString();
return;
}
}

void Centipede::GenerateRUsageReport(std::string_view filename_annotation,
std::string_view description) {
class ReportDumper : public RUsageProfiler::ReportSink {
public:
explicit ReportDumper(std::string_view path)
: file_{*RemoteFileOpen(path, "w")} {
FUZZTEST_CHECK(file_ != nullptr) << VV(path);
FUZZTEST_CHECK_OK(
RemoteFileSetWriteBufferSize(file_, 10UL * 1024 * 1024));
}
~ReportDumper() override { FUZZTEST_CHECK_OK(RemoteFileClose(file_)); }
ReportDumper &operator<<(std::string_view fragment) override {
FUZZTEST_CHECK_OK(RemoteFileAppend(
file_, ByteArray{fragment.cbegin(), fragment.cend()}));
return *this;
}
private:
RemoteFile *file_;
};
const auto &snapshot = rusage_profiler_.TakeSnapshot(
{__FILE__, __LINE__}, std::string{description});
FUZZTEST_VLOG(1) << "Rusage @ " << description << ": "
<< snapshot.ShortMetricsStr();
auto path = wd_.RUsageReportPath(filename_annotation);
FUZZTEST_LOG(INFO) << "Generate rusage report [" << description << "]; "
<< VV(env_.my_shard_index) << VV(path);
ReportDumper dumper{path};
rusage_profiler_.GenerateReport(&dumper);
}

void Centipede::MaybeGenerateTelemetry(std::string_view filename_annotation,
std::string_view description) {
if (env_.DumpCorpusTelemetryInThisShard()) {
GenerateCoverageReport(filename_annotation, description);
GenerateCorpusStats(filename_annotation, description);
GenerateSourceBasedCoverageReport(filename_annotation, description);
}
if (env_.DumpRUsageTelemetryInThisShard()) {
GenerateRUsageReport(filename_annotation, description);
}
}

void Centipede::MaybeGenerateTelemetryAfterBatch(
std::string_view filename_annotation, size_t batch_index) {
if (env_.DumpTelemetryForThisBatch(batch_index)) {
MaybeGenerateTelemetry( //
filename_annotation, absl::StrCat("After batch ", batch_index));
}
}
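
// Loads shard `shard_index_to_merge` from the workdir in `merge_from_dir` and
// appends any inputs that added coverage to this shard's corpus file.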
void Centipede::MergeFromOtherCorpus(std::string_view merge_from_dir,
size_t shard_index_to_merge) {
FUZZTEST_LOG(INFO) << __func__ << ": " << merge_from_dir;
Environment merge_from_env = env_;
merge_from_env.workdir = merge_from_dir;
size_t initial_corpus_size = corpus_.NumActive();
LoadShard(merge_from_env, shard_index_to_merge, /*rerun=*/true);
size_t new_corpus_size = corpus_.NumActive();
FUZZTEST_CHECK_GE(new_corpus_size,
initial_corpus_size); // Corpus can't shrink here.
if (new_corpus_size > initial_corpus_size) {
auto appender = DefaultBlobFileWriterFactory(env_.riegeli);
FUZZTEST_CHECK_OK(
appender->Open(wd_.CorpusFilePaths().Shard(env_.my_shard_index), "a"));
for (size_t idx = initial_corpus_size; idx < new_corpus_size; ++idx) {
FUZZTEST_CHECK_OK(appender->Write(corpus_.Get(idx)));
}
FUZZTEST_LOG(INFO) << "Merge: " << (new_corpus_size - initial_corpus_size)
<< " new inputs added";
}
}

void Centipede::ReloadAllShardsAndWriteDistilledCorpus() {
// Reload the shards. This automatically distills the corpus by discarding
// inputs with duplicate feature sets as they are being added. Because the
// shards are reloaded in random order, a random winner survives from each
// set of duplicates, so multiple distilling shards produce different
// outputs from the same inputs (the property that we want).
LoadAllShardsInRandomOrder(env_, /*rerun_my_shard=*/false);
// Save the distilled corpus to a file in workdir and possibly to a hashed
// file in the first corpus dir passed in `--corpus_dir`.
const auto distill_to_path = wd_.DistilledCorpusFilePaths().MyShard();
FUZZTEST_LOG(INFO) << "Distilling: shard: " << env_.my_shard_index
<< " output: " << distill_to_path << " "
<< " distilled size: " << corpus_.NumActive();
const auto appender = DefaultBlobFileWriterFactory(env_.riegeli);
// NOTE: Always overwrite distilled corpus files -- never append, unlike
// "regular", per-shard corpus files.
FUZZTEST_CHECK_OK(appender->Open(distill_to_path, "w"));
for (size_t i = 0; i < corpus_.NumActive(); ++i) {
const ByteArray &input = corpus_.Get(i);
FUZZTEST_CHECK_OK(appender->Write(input));
if (!env_.corpus_dir.empty() && !env_.corpus_dir[0].empty()) {
WriteToLocalHashedFileInDir(env_.corpus_dir[0], input);
}
}
}
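
// Gets seed inputs from the user callbacks, runs them, and writes results to
// `corpus_file`/`features_file`. If no seeds are returned, falls back to a
// single {0} input (or exits if --require_seeds is set); if nothing made it
// into the corpus, force-adds the seeds so the corpus is never empty.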
void Centipede::LoadSeedInputs(BlobFileWriter *absl_nonnull corpus_file,
BlobFileWriter *absl_nonnull features_file) {
std::vector<ByteArray> seed_inputs;
const size_t num_seeds_available =
user_callbacks_.GetSeeds(env_.batch_size, seed_inputs);
if (num_seeds_available > env_.batch_size) {
FUZZTEST_LOG(WARNING) << "More seeds available than requested: "
<< num_seeds_available << " > " << env_.batch_size;
}
if (seed_inputs.empty()) {
FUZZTEST_QCHECK(!env_.require_seeds)
<< "No seeds returned and --require_seeds=true, exiting early.";
FUZZTEST_LOG(WARNING)
<< "No seeds returned - will use the default seed of single byte {0}";
seed_inputs.push_back({0});
}
RunBatch(seed_inputs, corpus_file, features_file,
/*unconditional_features_file=*/nullptr);
FUZZTEST_LOG(INFO) << "Number of input seeds available: "
<< num_seeds_available
<< ", number included in corpus: " << corpus_.NumTotal();
// Forcibly add all seed inputs to avoid an empty corpus in case none of them
// increased coverage and passed the filters.
if (corpus_.NumTotal() == 0) {
for (const auto &seed_input : seed_inputs)
corpus_.Add(seed_input, {}, {}, fs_, coverage_frontier_);
}
}
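
// The main fuzzing loop: loads the shard(s), runs the seed inputs, then
// repeatedly mutates corpus elements and executes the mutants in batches
// until env_.num_runs executions are done or a stop is requested, generating
// telemetry along the way.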
void Centipede::FuzzingLoop() {
FUZZTEST_LOG(INFO) << "Shard: " << env_.my_shard_index << "/"
<< env_.total_shards << " " << TemporaryLocalDirPath()
<< " "
<< "seed: " << env_.seed << "\n\n\n";
UpdateAndMaybeLogStats("begin-fuzz", 0);
if (env_.full_sync) {
LoadAllShardsInRandomOrder(env_, /*rerun_my_shard=*/true);
} else {
LoadShard(env_, env_.my_shard_index, /*rerun=*/true);
}
if (!env_.merge_from.empty()) {
// Merge a shard with the same index from another corpus.
MergeFromOtherCorpus(env_.merge_from, env_.my_shard_index);
}
if (env_.load_shards_only) return;
auto corpus_path = wd_.CorpusFilePaths().Shard(env_.my_shard_index);
auto corpus_file = DefaultBlobFileWriterFactory(env_.riegeli);
FUZZTEST_CHECK_OK(corpus_file->Open(corpus_path, "a"));
auto features_path = wd_.FeaturesFilePaths().Shard(env_.my_shard_index);
auto features_file = DefaultBlobFileWriterFactory(env_.riegeli);
FUZZTEST_CHECK_OK(features_file->Open(features_path, "a"));
LoadSeedInputs(corpus_file.get(), features_file.get());
UpdateAndMaybeLogStats("init-done", 0);
// If we're going to fuzz, dump the initial telemetry files. For a brand-new
// run, these will be functionally empty, e.g. the coverage report will list
// all target functions as not covered (NONE). For a bootstrapped run (the
// workdir already has data), these may or may not coincide with the final
// "latest" report of the previous run, depending on how the runs are
// configured (the same number of shards, for example).
if (env_.num_runs != 0) MaybeGenerateTelemetry("initial", "Before fuzzing");
// Reset fuzz_start_time_ and num_runs_, so that the pre-init work doesn't
// affect them.
fuzz_start_time_ = absl::Now();
num_runs_ = 0;
// num_runs / batch_size, rounded up.
size_t number_of_batches = env_.num_runs / env_.batch_size;
if (env_.num_runs % env_.batch_size != 0) ++number_of_batches;
size_t new_runs = 0;
size_t corpus_size_at_last_prune = corpus_.NumActive();
for (size_t batch_index = 0; batch_index < number_of_batches; batch_index++) {
if (ShouldStop()) break;
FUZZTEST_CHECK_LT(new_runs, env_.num_runs);
auto remaining_runs = env_.num_runs - new_runs;
auto batch_size = std::min(env_.batch_size, remaining_runs);
std::vector<MutationInputRef> mutation_inputs;
mutation_inputs.reserve(env_.mutate_batch_size);
for (size_t i = 0; i < env_.mutate_batch_size; i++) {
const auto &corpus_record = env_.use_corpus_weights
? corpus_.WeightedRandom(rng_())
: corpus_.UniformRandom(rng_());
mutation_inputs.push_back(
MutationInputRef{corpus_record.data, &corpus_record.metadata});
}
const std::vector<ByteArray> mutants =
user_callbacks_.Mutate(mutation_inputs, batch_size);
if (ShouldStop()) break;
bool gained_new_coverage =
RunBatch(mutants, corpus_file.get(), features_file.get(), nullptr);
new_runs += mutants.size();
if (gained_new_coverage) {
UpdateAndMaybeLogStats("new-feature", 1);
} else if (((batch_index - 1) & batch_index) == 0) {
// Log if batch_index is a power of two.
UpdateAndMaybeLogStats("pulse", 1);
}
// Dump the intermediate telemetry files.
MaybeGenerateTelemetryAfterBatch("latest", batch_index);
if (env_.load_other_shard_frequency != 0 && batch_index != 0 &&
(batch_index % env_.load_other_shard_frequency) == 0 &&
env_.total_shards > 1) {
size_t rand = rng_() % (env_.total_shards - 1);
size_t other_shard_index =
(env_.my_shard_index + 1 + rand) % env_.total_shards;
FUZZTEST_CHECK_NE(other_shard_index, env_.my_shard_index);
LoadShard(env_, other_shard_index, /*rerun=*/false);
}
// Prune if we added enough new elements since last prune.
if (env_.prune_frequency != 0 &&
corpus_.NumActive() >
corpus_size_at_last_prune + env_.prune_frequency) {
if (env_.use_coverage_frontier) coverage_frontier_.Compute(corpus_);
corpus_.Prune(fs_, coverage_frontier_, env_.max_corpus_size, rng_);
corpus_size_at_last_prune = corpus_.NumActive();
}
}
if (env_.persistent_mode) {
user_callbacks_.CleanUpPersistentMode();
}
// The tests rely on this stat being logged last.
UpdateAndMaybeLogStats("end-fuzz", 0);
// If we've fuzzed anything, dump the final telemetry files.
if (env_.num_runs != 0) MaybeGenerateTelemetry("final", "After fuzzing");
}
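
// Handles a failed batch: logs the failure and, unless it is ignored or is a
// skipped-test/setup failure (which request early stop), re-executes inputs
// one-by-one (suspect first) to find a single-input reproducer; if none is
// found, saves the batch prefix up to the suspect for offline triage.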
void Centipede::ReportCrash(std::string_view binary,
const std::vector<ByteArray> &input_vec,
const BatchResult &batch_result) {
FUZZTEST_CHECK_EQ(input_vec.size(), batch_result.results().size());
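// The runner reports one output per input it managed to execute, so the first
// input without an output - index num_outputs_read(), clamped to the batch
// bounds - is the most likely crasher.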
const size_t suspect_input_idx = std::clamp<size_t>(
batch_result.num_outputs_read(), 0, input_vec.size() - 1);
auto log_execution_failure = [&](std::string_view log_prefix) {
absl::MutexLock lock(&GetExecutionLoggingMutex());
FUZZTEST_LOG(INFO)
<< log_prefix << "Batch execution failed:"
<< "\nBinary : " << binary
<< "\nExit code : " << batch_result.exit_code()
<< "\nFailure : " << batch_result.failure_description()
<< "\nSignature : "
<< AsPrintableString(AsByteSpan(batch_result.failure_signature()),
/*max_len=*/32)
<< "\nNumber of inputs : " << input_vec.size()
<< "\nNumber of inputs read: " << batch_result.num_outputs_read()
<< (batch_result.IsSetupFailure()
? ""
: absl::StrCat("\nSuspect input index : ", suspect_input_idx))
<< "\nCrash log :\n\n";
for (const auto &log_line :
absl::StrSplit(absl::StripAsciiWhitespace(batch_result.log()), '\n')) {
FUZZTEST_LOG(INFO).NoPrefix() << "CRASH LOG: " << log_line;
}
FUZZTEST_LOG(INFO).NoPrefix() << "\n";
};
if (batch_result.IsIgnoredFailure()) {
FUZZTEST_LOG(INFO) << "Skip further processing of "
<< batch_result.failure_description();
return;
}
if (batch_result.IsSkippedTest()) {
log_execution_failure("Skipped Test: ");
FUZZTEST_LOG(INFO) << "Requesting early stop due to skipped test.";
RequestEarlyStop(EXIT_SUCCESS);
return;
}
if (batch_result.IsSetupFailure()) {
log_execution_failure("Test Setup Failure: ");
FUZZTEST_LOG(INFO)
<< "Requesting early stop due to setup failure in the test.";
RequestEarlyStop(EXIT_FAILURE);
return;
}
// Skip reporting only if RequestEarlyStop() was called; we still report
// crashes if the time limit runs out.
if (EarlyStopRequested()) return;
if (++num_crashes_ > env_.max_num_crash_reports) return;
const std::string log_prefix =
absl::StrCat("ReportCrash[", num_crashes_, "]: ");
log_execution_failure(log_prefix);
FUZZTEST_LOG_IF(INFO, num_crashes_ == env_.max_num_crash_reports)
<< log_prefix
<< "Reached --max_num_crash_reports: further reports will be suppressed";
if (batch_result.failure_description() == kExecutionFailurePerBatchTimeout) {
FUZZTEST_LOG(INFO)
<< log_prefix
<< "Failure applies to entire batch: not executing inputs "
"one-by-one, trying to find the reproducer";
return;
}
// Determine the optimal order of the inputs to try to maximize the chances of
// finding the reproducer fast.
std::vector<size_t> input_idxs_to_try;
// Prioritize the presumed crasher by inserting it in front of everything
// else.
input_idxs_to_try.push_back(suspect_input_idx);
if (!env_.batch_triage_suspect_only) {
// TODO(b/274705740): When the bug is fixed, set `input_idxs_to_try`'s size
// to `suspect_input_idx + 1`.
input_idxs_to_try.resize(input_vec.size() + 1);
// Keep the suspect at the old location, too, in case the target was
// primed for a crash by the sequence of inputs that preceded the crasher.
std::iota(input_idxs_to_try.begin() + 1, input_idxs_to_try.end(), 0);
} else {
FUZZTEST_LOG(INFO)
<< log_prefix
<< "Skip finding the reproducer from the inputs other than the suspect";
}
// Try inputs one-by-one in the determined order.
FUZZTEST_LOG(INFO)
<< log_prefix
<< "Executing inputs one-by-one, trying to find the reproducer";
for (auto input_idx : input_idxs_to_try) {
if (ShouldStop()) return;
const auto &one_input = input_vec[input_idx];
BatchResult one_input_batch_result;
if (!user_callbacks_.Execute(binary, {one_input}, one_input_batch_result)) {
auto hash = Hash(one_input);
auto crash_dir = wd_.CrashReproducerDirPaths().MyShard();
FUZZTEST_CHECK_OK(RemoteMkdir(crash_dir));
std::string input_file_path = std::filesystem::path(crash_dir) / hash;
auto crash_metadata_dir = wd_.CrashMetadataDirPaths().MyShard();
FUZZTEST_CHECK_OK(RemoteMkdir(crash_metadata_dir));
std::string crash_metadata_path_prefix =
std::filesystem::path(crash_metadata_dir) / hash;
FUZZTEST_LOG(INFO)
<< log_prefix << "Detected crash-reproducing input:"
<< "\nInput index : " << input_idx << "\nInput bytes : "
<< AsPrintableString(one_input, /*max_len=*/32)
<< "\nExit code : " << one_input_batch_result.exit_code()
<< "\nFailure : "
<< one_input_batch_result.failure_description()
<< "\nSignature : "
<< AsPrintableString(
AsByteSpan(one_input_batch_result.failure_signature()),
/*max_len=*/32)
<< "\nSaving input to: " << input_file_path << "\nSaving crash" //
<< "\nmetadata to : " << crash_metadata_path_prefix << ".*";
FUZZTEST_CHECK_OK(RemoteFileSetContents(input_file_path, one_input));
FUZZTEST_CHECK_OK(RemoteFileSetContents(
absl::StrCat(crash_metadata_path_prefix, ".desc"),
one_input_batch_result.failure_description()));
FUZZTEST_CHECK_OK(RemoteFileSetContents(
absl::StrCat(crash_metadata_path_prefix, ".sig"),
one_input_batch_result.failure_signature()));
return;
}
}
FUZZTEST_LOG(INFO) << log_prefix
<< "Crash was not observed when running inputs one-by-one";
// There will be cases when several inputs collectively cause a crash, but no
// single input does. Handle this by writing out the inputs from the batch
// between 0 and `suspect_input_idx` (inclusive) as individual files under the
// <--workdir>/crash/crashing_batch-<HASH_OF_SUSPECT_INPUT> directory.
// TODO(bookholt): Check for repro by re-running the whole batch.
// TODO(ussuri): Consolidate the crash reproduction logic here and above.
// TODO(ussuri): This can create a lot of tiny files. Write to a single
// shard-like corpus file instead.
const auto &suspect_input = input_vec[suspect_input_idx];
auto suspect_hash = Hash(suspect_input);
auto crash_dir = wd_.CrashReproducerDirPaths().MyShard();
FUZZTEST_CHECK_OK(RemoteMkdir(crash_dir));
std::string crashing_batch_name =
absl::StrCat("crashing_batch-", suspect_hash);
std::string save_dir = std::filesystem::path(crash_dir) / crashing_batch_name;
FUZZTEST_CHECK_OK(RemoteMkdir(save_dir));
FUZZTEST_LOG(INFO) << log_prefix
<< "Saving used inputs from batch to: " << save_dir;
for (int i = 0; i <= suspect_input_idx; ++i) {
const auto &one_input = input_vec[i];
auto hash = Hash(one_input);
std::string file_path = std::filesystem::path(save_dir).append(
absl::StrFormat("input-%010d-%s", i, hash));
FUZZTEST_CHECK_OK(RemoteFileSetContents(file_path, one_input));
}
auto crash_metadata_dir = wd_.CrashMetadataDirPaths().MyShard();
FUZZTEST_CHECK_OK(RemoteMkdir(crash_metadata_dir));
std::string crash_metadata_file_path =
std::filesystem::path(crash_metadata_dir) / crashing_batch_name;
FUZZTEST_LOG(INFO) << log_prefix << "Saving crash metadata to: "
<< crash_metadata_file_path;
FUZZTEST_CHECK_OK(RemoteFileSetContents(crash_metadata_file_path,
batch_result.failure_description()));
}

} // namespace fuzztest::internal