No public description

PiperOrigin-RevId: 617653153
diff --git a/centipede/corpus_io.cc b/centipede/corpus_io.cc
index 9cf3ee8..7d5e667 100644
--- a/centipede/corpus_io.cc
+++ b/centipede/corpus_io.cc
@@ -59,21 +59,14 @@
   //  it (but doesn't really need it). Investigate and switch to
   //  `absl::flat_hash_map`.
   std::multimap<std::string /*hash*/, ByteArray /*input*/> hash_to_input;
+
   // Read inputs from the corpus file into `hash_to_input`.
   auto corpus_reader = DefaultBlobFileReaderFactory();
   CHECK_OK(corpus_reader->Open(corpus_path)) << VV(corpus_path);
-  ByteSpan blob;
-  while (corpus_reader->Read(blob).ok()) {
-    std::string hash = Hash(blob);
-    ByteArray input{blob.begin(), blob.end()};
-    hash_to_input.emplace(std::move(hash), std::move(input));
-  }
-
-  RPROF_SNAPSHOT("Read inputs");
 
   // Input counts of various kinds (for logging).
-  const size_t num_inputs = hash_to_input.size();
-  size_t num_inputs_missing_features = num_inputs;
+  size_t num_inputs = 0;
+  size_t num_inputs_missing_features = 0;
   size_t num_inputs_empty_features = 0;
   size_t num_inputs_non_empty_features = 0;
 
@@ -88,34 +81,62 @@
     // only inputs without matching features.
     auto features_reader = DefaultBlobFileReaderFactory();
     CHECK_OK(features_reader->Open(features_path)) << VV(features_path);
-    ByteSpan hash_and_features;
-    while (features_reader->Read(hash_and_features).ok()) {
+    ByteSpan features_blob;
+    while (features_reader->Read(features_blob).ok()) {
       // Every valid feature record must contain the hash at the end.
       // Ignore this record if it is too short.
-      if (hash_and_features.size() < kHashLen) continue;
+      if (features_blob.size() < kHashLen) continue;
+
       FeatureVec features;
-      std::string hash = UnpackFeaturesAndHash(hash_and_features, &features);
-      auto input_node = hash_to_input.extract(hash);
-      if (!input_node.empty()) {
-        --num_inputs_missing_features;
-        if (features.empty()) {
-          // When the features file got created, Centipede did compute features
-          // for the input, but they came up empty. Indicate to the client that
-          // there is no need to recompute by passing this special value.
+      const std::string feature_hash =
+          UnpackFeaturesAndHash(features_blob, &features);
+
+      ByteArray matching_input;
+      if (auto input_node = hash_to_input.extract(feature_hash);
+          !input_node.empty()) {
+        // A matching input has already been scanned in during one of the
+        // previous lookaheads: use it.
+        matching_input = std::move(input_node.mapped());
+      } else {
+        // A matching input has not been found during the previous lookaheads:
+        // perform a new one, storing mismatching inputs into the hash map along
+        // the way.
+        ByteSpan input_blob;
+        while (corpus_reader->Read(input_blob).ok()) {
+          ++num_inputs;
+          std::string input_hash = Hash(input_blob);
+          ByteArray input{input_blob.begin(), input_blob.end()};
+          if (input_hash == feature_hash) {
+            matching_input = std::move(input);
+            break;
+          } else {
+            hash_to_input.emplace(std::move(input_hash), std::move(input));
+          }
+        }
+      }
+
+      if (!matching_input.empty()) {
+        if (!features.empty()) {
+          // A "normal" input with non-empty features.
+          ++num_inputs_non_empty_features;
+        } else {
+          // Centipede computed empty features for this input previously.
+          // Indicate to the client that it doesn't need to recompute them by
+          // passing this special value.
           features = {feature_domains::kNoFeature};
           ++num_inputs_empty_features;
-        } else {
-          ++num_inputs_non_empty_features;
         }
-        callback(std::move(input_node.mapped()), std::move(features));
+        callback(std::move(matching_input), std::move(features));
       }
     }
 
     RPROF_SNAPSHOT("Read features & reported input/features pairs");
   }
 
-  // Finally, call `callback` on the remaining inputs without matching features.
-  // This also automatically covers the features file not passed or missing.
+  // Finally, call `callback` on the inputs without matching features, which we
+  // have accumulated during lookaheads. This also automatically covers the case
+  // of a features file not passed or missing.
+  num_inputs_missing_features = hash_to_input.size();
   for (auto &&[hash, input] : hash_to_input) {
     // Indicate to the client that it needs to recompute features for this input
     // by passing an empty value.