Save --analyze coverage information to a proto

PiperOrigin-RevId: 561133459
diff --git a/centipede/BUILD b/centipede/BUILD
index 07db178..45b3010 100644
--- a/centipede/BUILD
+++ b/centipede/BUILD
@@ -77,6 +77,17 @@
     deps = [":seed_corpus_config_proto"],
 )
 
+# Proto representation of coverage reports
+proto_library(
+    name = "coverage_proto",
+    srcs = ["coverage.proto"],
+)
+
+cc_proto_library(
+    name = "coverage_cc_proto",
+    deps = [":coverage_proto"],
+)
+
 ################################################################################
 #                             C++ libraries
 ################################################################################
@@ -314,9 +325,12 @@
         ":binary_info",
         ":control_flow",
         ":corpus",
+        ":coverage_cc_proto",
         ":feature",
         ":logging",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/centipede/analyze_corpora.cc b/centipede/analyze_corpora.cc
index d4a4bbc..60680a7 100644
--- a/centipede/analyze_corpora.cc
+++ b/centipede/analyze_corpora.cc
@@ -15,17 +15,63 @@
 #include "./centipede/analyze_corpora.h"
 
 #include <algorithm>
+#include <cstddef>
+#include <fstream>
+#include <ios>
+#include <string>
+#include <vector>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "./centipede/control_flow.h"
 #include "./centipede/corpus.h"
+#include "./centipede/coverage.pb.h"
 #include "./centipede/feature.h"
 #include "./centipede/logging.h"
 
 namespace centipede {
+
+namespace {
+
+// Converts `pcs` into a `CoverageReport` with one `Edge` per PC, in order.
+// CHECK-fails unless every location is "<file>:<line>:<column>".
+CoverageReport ToCoverageReport(const std::vector<size_t> &pcs,
+                                const SymbolTable &symbols) {
+  CoverageReport result;
+  for (const size_t pc : pcs) {
+    CoverageReport::Edge *edge = result.add_covered_edges();
+    edge->set_function_name(symbols.func(pc));
+    // Line and column are the last two ':'-separated fields; everything
+    // before them is the file name, so a ':' inside the path is preserved.
+    const std::string location = symbols.location(pc);
+    const std::vector<std::string> parts = absl::StrSplit(location, ':');
+    CHECK(parts.size() >= 3)
+        << "Unexpected number of elements when splitting source location: "
+        << location;
+    const std::string &line_str = parts[parts.size() - 2];
+    const std::string &column_str = parts.back();
+    edge->set_file_name(location.substr(
+        0, location.size() - line_str.size() - column_str.size() - 2));
+    int line = 0;
+    CHECK(absl::SimpleAtoi(line_str, &line))
+        << "Unable to convert line number to integer: " << line_str;
+    edge->set_line(line);
+    int column = 0;
+    CHECK(absl::SimpleAtoi(column_str, &column))
+        << "Unable to convert column number to integer: " << column_str;
+    edge->set_column(column);
+  }
+  return result;
+}
+
+}  // namespace
+
 void AnalyzeCorpora(const BinaryInfo &binary_info,
                     const std::vector<CorpusRecord> &a,
-                    const std::vector<CorpusRecord> &b) {
+                    const std::vector<CorpusRecord> &b,
+                    std::string_view analyze_report_path) {
   // `a_pcs` will contain all PCs covered by `a`.
   absl::flat_hash_set<size_t> a_pcs;
   for (const auto &record : a) {
@@ -40,6 +86,7 @@
   // `b_unique_indices` are indices of inputs that have PCs from `b_only_pcs`.
   // `b_shared_indices` are indices of all other inputs from `b`.
   absl::flat_hash_set<size_t> b_only_pcs;
+  absl::flat_hash_set<size_t> b_pcs;
   std::vector<size_t> b_shared_indices, b_unique_indices;
   for (size_t i = 0; i < b.size(); ++i) {
     const auto &record = b[i];
@@ -47,6 +94,7 @@
     for (const auto &feature : record.features) {
       if (!feature_domains::kPCs.Contains(feature)) continue;
       auto pc = ConvertPCFeatureToPcIndex(feature);
+      b_pcs.insert(pc);
       if (a_pcs.contains(pc)) continue;
       b_only_pcs.insert(pc);
       has_b_only = true;
@@ -56,20 +104,37 @@
     else
       b_shared_indices.push_back(i);
   }
+
+  absl::flat_hash_set<size_t> a_only_pcs;
+  for (const auto &record : a) {
+    for (const auto &feature : record.features) {
+      if (!feature_domains::kPCs.Contains(feature)) continue;
+      auto pc = ConvertPCFeatureToPcIndex(feature);
+      if (b_pcs.contains(pc)) continue;
+      a_only_pcs.insert(pc);
+    }
+  }
   LOG(INFO) << VV(a.size()) << VV(b.size()) << VV(a_pcs.size())
-            << VV(b_only_pcs.size()) << VV(b_shared_indices.size())
-            << VV(b_unique_indices.size());
+            << VV(a_only_pcs.size()) << VV(b_only_pcs.size())
+            << VV(b_shared_indices.size()) << VV(b_unique_indices.size());
 
   const auto &pc_table = binary_info.pc_table;
   const auto &symbols = binary_info.symbols;
   CoverageLogger coverage_logger(pc_table, symbols);
 
-  CoverageFrontier frontier_a(binary_info);
-  frontier_a.Compute(a);
+  // TODO: these cause a CHECK-fail
+  // CoverageFrontier frontier_a(binary_info);
+  // frontier_a.Compute(a);
 
   // TODO(kcc): use frontier_a to show the most interesting b-only PCs.
 
-  // Sort b-only PCs to print them in the canonical order, as in pc_table.
+  // Sort PCs to print them in the canonical order, as in pc_table.
+  std::vector<size_t> a_pcs_vec{a_pcs.begin(), a_pcs.end()};
+  std::sort(a_pcs_vec.begin(), a_pcs_vec.end());
+  std::vector<size_t> b_pcs_vec{b_pcs.begin(), b_pcs.end()};
+  std::sort(b_pcs_vec.begin(), b_pcs_vec.end());
+  std::vector<size_t> a_only_pcs_vec{a_only_pcs.begin(), a_only_pcs.end()};
+  std::sort(a_only_pcs_vec.begin(), a_only_pcs_vec.end());
   std::vector<size_t> b_only_pcs_vec{b_only_pcs.begin(), b_only_pcs.end()};
   std::sort(b_only_pcs_vec.begin(), b_only_pcs_vec.end());
 
@@ -90,6 +155,24 @@
     auto str = coverage_logger.ObserveAndDescribeIfNew(pc);
     if (!str.empty()) LOG(INFO).NoPrefix() << str;
   }
+
+  // Optionally persist the four coverage sets as a binary AnalyzeReport proto.
+  if (!analyze_report_path.empty()) {
+    AnalyzeReport analyze_report;
+    *analyze_report.mutable_a_coverage() = ToCoverageReport(a_pcs_vec, symbols);
+    *analyze_report.mutable_b_coverage() = ToCoverageReport(b_pcs_vec, symbols);
+    *analyze_report.mutable_a_only_coverage() =
+        ToCoverageReport(a_only_pcs_vec, symbols);
+    *analyze_report.mutable_b_only_coverage() =
+        ToCoverageReport(b_only_pcs_vec, symbols);
+
+    std::ofstream f(std::string{analyze_report_path}, std::ios::binary);
+    CHECK(f) << "Unable to open AnalyzeReport path: " << analyze_report_path;
+    // Fail loudly on a failed write instead of silently leaving a truncated
+    // report; `f` is flushed here and closed by its destructor.
+    CHECK(analyze_report.SerializeToOstream(&f) && f.flush().good())
+        << "Unable to write AnalyzeReport to: " << analyze_report_path;
+  }
 }
 
 }  // namespace centipede
diff --git a/centipede/analyze_corpora.h b/centipede/analyze_corpora.h
index 9843562..cef41a9 100644
--- a/centipede/analyze_corpora.h
+++ b/centipede/analyze_corpora.h
@@ -23,7 +23,8 @@
 // Analyzes two corpora, `a` and `b`, reports the differences.
 void AnalyzeCorpora(const BinaryInfo &binary_info,
                     const std::vector<CorpusRecord> &a,
-                    const std::vector<CorpusRecord> &b);
+                    const std::vector<CorpusRecord> &b,
+                    std::string_view analyze_report_path);
 
 }  // namespace centipede
 
diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc
index 9ea4a10..5dfe2e4 100644
--- a/centipede/centipede_interface.cc
+++ b/centipede/centipede_interface.cc
@@ -178,7 +178,7 @@
     LOG(INFO) << "corpus size " << corpus.size();
   }
   CHECK_EQ(corpora.size(), 2);
-  AnalyzeCorpora(binary_info, corpora[0], corpora[1]);
+  AnalyzeCorpora(binary_info, corpora[0], corpora[1], env.analyze_report);
   return EXIT_SUCCESS;
 }
 
diff --git a/centipede/coverage.proto b/centipede/coverage.proto
new file mode 100644
index 0000000..b27ea64
--- /dev/null
+++ b/centipede/coverage.proto
@@ -0,0 +1,51 @@
+// Copyright 2023 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Proto representation of Centipede coverage reports.
+syntax = "proto3";
+
+package centipede;
+
+// Describes the edges covered by, e.g., a fuzzing run. Each entry of
+// `covered_edges` records one edge's function and source location.
+// Next tag: 2
+message CoverageReport {
+  // Describes a particular location within a program.
+  //
+  // Next tag: 5
+  message Edge {
+    // The name of the function where the edge resides.
+    string function_name = 1;
+    // The name of the file where the edge resides.
+    string file_name = 2;
+    // The line in `file_name` where the edge resides.
+    int32 line = 3;
+    // The column in `file_name` where the edge resides.
+    int32 column = 4;
+  }
+  repeated Edge covered_edges = 1;
+}
+
+// Describes the comparison of the coverage of two corpora A and B (--analyze).
+// Next tag: 5
+message AnalyzeReport {
+  // Coverage from corpus A.
+  CoverageReport a_coverage = 1;
+  // Coverage from corpus B.
+  CoverageReport b_coverage = 2;
+  // Coverage found in A but not in B.
+  CoverageReport a_only_coverage = 3;
+  // Coverage found in B but not in A.
+  CoverageReport b_only_coverage = 4;
+}
diff --git a/centipede/environment.cc b/centipede/environment.cc
index 753c621..cccdb16 100644
--- a/centipede/environment.cc
+++ b/centipede/environment.cc
@@ -337,6 +337,9 @@
           " as argv and analyze differences between those corpora."
           " Used by the Centipede developers to improve the engine. "
           " TODO(kcc) implement. ");
+ABSL_FLAG(std::string, analyze_report, "",
+          "If set, --analyze will output a binary proto to the provided path "
+          "containing the AnalyzeReport");
 ABSL_FLAG(std::string, dictionary, "",
           "A comma-separated list of paths to dictionary files. The dictionary "
           "file is either in AFL/libFuzzer plain text format or in the binary "
@@ -472,6 +475,7 @@
       for_each_blob(absl::GetFlag(FLAGS_for_each_blob)),
       experiment(absl::GetFlag(FLAGS_experiment)),
       analyze(absl::GetFlag(FLAGS_analyze)),
+      analyze_report(absl::GetFlag(FLAGS_analyze_report)),
       exit_on_crash(absl::GetFlag(FLAGS_exit_on_crash)),
       max_num_crash_reports(absl::GetFlag(FLAGS_num_crash_reports)),
       minimize_crash_file_path(absl::GetFlag(FLAGS_minimize_crash)),
diff --git a/centipede/environment.h b/centipede/environment.h
index 9d36950..84b97e2 100644
--- a/centipede/environment.h
+++ b/centipede/environment.h
@@ -105,6 +105,7 @@
   std::string for_each_blob;
   std::string experiment;
   bool analyze;
+  std::string analyze_report;
   bool exit_on_crash;
   size_t max_num_crash_reports;
   std::string minimize_crash_file_path;