Replicate generic hardware events on all CPU PMUs (#2123)

On systems with more than one PMU for the CPUs (e.g. Apple M series SOCs),
generic hardware events are only created for an arbitrary PMU. Usually
this is the big cluster's PMU, which can cause inaccuracies when the
process is scheduled onto a little core. To fix this, teach PerfCounters
to register generic hardware events on all CPU PMUs.

CPU PMUs are identified using the same method as the perf tool: the
well-known PMU names "cpu" and "cpum_cf", or the presence of a "cpus"
node under /sys/bus/event_source/devices/<pmu>/.
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
index f47aa7b..e6f2209 100644
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@@ -16,9 +16,15 @@
 
 #include <cstring>
 #include <memory>
+#include <optional>
 #include <vector>
 
 #if defined HAVE_LIBPFM
+#include <dirent.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <sys/stat.h>
+
 #include "perfmon/pfmlib.h"
 #include "perfmon/pfmlib_perf_event.h"
 #endif
@@ -68,7 +74,7 @@
 
 bool PerfCounters::IsCounterSupported(const std::string& name) {
   Initialize();
-  perf_event_attr_t attr;
+  perf_event_attr attr;
   std::memset(&attr, 0, sizeof(attr));
   pfm_perf_encode_arg_t arg;
   std::memset(&arg, 0, sizeof(arg));
@@ -79,6 +85,55 @@
   return (ret == PFM_SUCCESS);
 }
 
+static std::optional<std::vector<uint64_t>> QueryCPUPMUTypes() {
+  std::vector<uint64_t> types;
+  DIR* dir = opendir("/sys/bus/event_source/devices");
+  if (!dir) {
+    return std::nullopt;
+  }
+  while (dirent* ent = readdir(dir)) {
+    std::string_view name_str = ent->d_name;
+    auto node_path = [&](const char* node) {
+      return std::string("/sys/bus/event_source/devices/") + ent->d_name + "/" +
+             node;
+    };
+    struct stat st;
+    if (name_str == "cpu" || name_str == "cpum_cf" ||
+        stat(node_path("cpus").c_str(), &st) == 0 || errno != ENOENT) {
+      int type_fd = open(node_path("type").c_str(), O_RDONLY);
+      if (type_fd < 0) {
+        closedir(dir);
+        return std::nullopt;
+      }
+      char type_str[32] = {};
+      ssize_t res = read(type_fd, type_str, sizeof(type_str) - 1);
+      close(type_fd);
+      if (res < 0) {
+        closedir(dir);
+        return std::nullopt;
+      }
+      uint64_t type;
+      if (sscanf(type_str, "%" PRIu64, &type) != 1) {
+        closedir(dir);
+        return std::nullopt;
+      }
+      types.push_back(type);
+    }
+  }
+  closedir(dir);
+  return types;
+}
+
+static std::vector<uint64_t> GetPMUTypesForEvent(const perf_event_attr& attr) {
+  // Replicate generic hardware events on all CPU PMUs.
+  if (attr.type == PERF_TYPE_HARDWARE && attr.config < PERF_COUNT_HW_MAX) {
+    if (auto types = QueryCPUPMUTypes()) {
+      return *types;
+    }
+  }
+  return {0};
+}
+
 PerfCounters PerfCounters::Create(
     const std::vector<std::string>& counter_names) {
   if (!counter_names.empty()) {
@@ -158,50 +213,54 @@
     attr.read_format = PERF_FORMAT_GROUP;  //| PERF_FORMAT_TOTAL_TIME_ENABLED |
                                            // PERF_FORMAT_TOTAL_TIME_RUNNING;
 
-    int id = -1;
-    while (id < 0) {
-      static constexpr size_t kNrOfSyscallRetries = 5;
-      // Retry syscall as it was interrupted often (b/64774091).
-      for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
-           ++num_retries) {
-        id = perf_event_open(&attr, 0, -1, group_id, 0);
-        if (id >= 0 || errno != EINTR) {
-          break;
+    uint64_t base_config = attr.config;
+    for (uint64_t pmu : GetPMUTypesForEvent(attr)) {
+      attr.config = (pmu << PERF_PMU_TYPE_SHIFT) | base_config;
+      int id = -1;
+      while (id < 0) {
+        static constexpr size_t kNrOfSyscallRetries = 5;
+        // Retry syscall as it was interrupted often (b/64774091).
+        for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+             ++num_retries) {
+          id = perf_event_open(&attr, 0, -1, group_id, 0);
+          if (id >= 0 || errno != EINTR) {
+            break;
+          }
+        }
+        if (id < 0) {
+          // If the file descriptor is negative we might have reached a limit
+          // in the current group. Set the group_id to -1 and retry
+          if (group_id >= 0) {
+            // Create a new group
+            group_id = -1;
+          } else {
+            // At this point we have already retried to set a new group id and
+            // failed. We then give up.
+            break;
+          }
         }
       }
+
+      // We failed to get a new file descriptor. We might have reached a hard
+      // hardware limit that cannot be resolved even with group multiplexing
       if (id < 0) {
-        // If the file descriptor is negative we might have reached a limit
-        // in the current group. Set the group_id to -1 and retry
-        if (group_id >= 0) {
-          // Create a new group
-          group_id = -1;
-        } else {
-          // At this point we have already retried to set a new group id and
-          // failed. We then give up.
-          break;
-        }
+        GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
+                                 "for performance counter "
+                              << name << ". Ignoring\n";
+
+        // We give up on this counter but try to keep going
+        // as the others would be fine
+        continue;
       }
+      if (group_id < 0) {
+        // This is a leader, store and assign it to the current file descriptor
+        leader_ids.push_back(id);
+        group_id = id;
+      }
+      // This is a valid counter, add it to our descriptor's list
+      counter_ids.push_back(id);
+      valid_names.push_back(name);
     }
-
-    // We failed to get a new file descriptor. We might have reached a hard
-    // hardware limit that cannot be resolved even with group multiplexing
-    if (id < 0) {
-      GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
-                               "for performance counter "
-                            << name << ". Ignoring\n";
-
-      // We give up on this counter but try to keep going
-      // as the others would be fine
-      continue;
-    }
-    if (group_id < 0) {
-      // This is a leader, store and assign it to the current file descriptor
-      leader_ids.push_back(id);
-      group_id = id;
-    }
-    // This is a valid counter, add it to our descriptor's list
-    counter_ids.push_back(id);
-    valid_names.push_back(name);
   }
 
   // Loop through all group leaders activating them
diff --git a/src/perf_counters.h b/src/perf_counters.h
index bf5eb6b..4e45344 100644
--- a/src/perf_counters.h
+++ b/src/perf_counters.h
@@ -152,7 +152,7 @@
 
   size_t num_counters() const { return counters_.num_counters(); }
 
-  std::vector<std::string> names() const { return counters_.names(); }
+  const std::vector<std::string>& names() const { return counters_.names(); }
 
   BENCHMARK_ALWAYS_INLINE bool Start() {
     if (num_counters() == 0) return true;
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
index 5de262f..6c923be 100644
--- a/test/perf_counters_gtest.cc
+++ b/test/perf_counters_gtest.cc
@@ -27,12 +27,22 @@
   EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
 }
 
+// Generic events will have as many counters as there are CPU PMUs, and each
+// will have the same name. In order to make these tests independent of the
+// number of CPU PMUs in the system, we uniquify the counter names before
+// testing them.
+static std::set<std::string> UniqueCounterNames(const PerfCounters& pc) {
+  std::set<std::string> names{pc.names().begin(), pc.names().end()};
+  return names;
+}
+
 TEST(PerfCountersTest, OneCounter) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Performance counters not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
+  EXPECT_EQ(
+      UniqueCounterNames(PerfCounters::Create({kGenericPerfEvent1})).size(), 1);
 }
 
 TEST(PerfCountersTest, NegativeTest) {
@@ -53,48 +63,101 @@
     // number of counters has to be two, not zero
     auto counter =
         PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
-    EXPECT_EQ(counter.num_counters(), 2);
-    EXPECT_EQ(counter.names(), std::vector<std::string>(
-                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
+    auto names = UniqueCounterNames(counter);
+    EXPECT_EQ(names.size(), 2);
+    EXPECT_EQ(names,
+              std::set<std::string>({kGenericPerfEvent2, kGenericPerfEvent1}));
   }
   {
     // Try sneaking in an outrageous counter, like a fat finger mistake
     auto counter = PerfCounters::Create(
         {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
-    EXPECT_EQ(counter.num_counters(), 2);
-    EXPECT_EQ(counter.names(), std::vector<std::string>(
-                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
+    auto names = UniqueCounterNames(counter);
+    EXPECT_EQ(names.size(), 2);
+    EXPECT_EQ(names,
+              std::set<std::string>({kGenericPerfEvent2, kGenericPerfEvent1}));
   }
   {
     // Finally try a golden input - it should like both of them
-    EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2})
-                  .num_counters(),
+    EXPECT_EQ(UniqueCounterNames(PerfCounters::Create(
+                                     {kGenericPerfEvent1, kGenericPerfEvent2}))
+                  .size(),
               2);
   }
   {
     // Add a bad apple in the end of the chain to check the edges
     auto counter = PerfCounters::Create(
         {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
-    EXPECT_EQ(counter.num_counters(), 2);
-    EXPECT_EQ(counter.names(), std::vector<std::string>(
-                                   {kGenericPerfEvent1, kGenericPerfEvent2}));
+    auto names = UniqueCounterNames(counter);
+    EXPECT_EQ(names.size(), 2);
+    EXPECT_EQ(names,
+              std::set<std::string>({kGenericPerfEvent1, kGenericPerfEvent2}));
   }
 }
 
+static std::map<std::string, uint64_t> SnapshotAndCombine(
+    PerfCounters& counters) {
+  PerfCounterValues values(counters.num_counters());
+  std::map<std::string, uint64_t> value_map;
+
+  if (counters.Snapshot(&values)) {
+    for (size_t i = 0; i != counters.num_counters(); ++i) {
+      value_map[counters.names()[i]] += values[i];
+    }
+  }
+  return value_map;
+}
+
 TEST(PerfCountersTest, Read1Counter) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters = PerfCounters::Create({kGenericPerfEvent1});
-  EXPECT_EQ(counters.num_counters(), 1);
-  PerfCounterValues values1(1);
-  EXPECT_TRUE(counters.Snapshot(&values1));
-  EXPECT_GT(values1[0], 0);
-  PerfCounterValues values2(1);
-  EXPECT_TRUE(counters.Snapshot(&values2));
-  EXPECT_GT(values2[0], 0);
-  EXPECT_GT(values2[0], values1[0]);
+  auto values1 = SnapshotAndCombine(counters);
+  EXPECT_EQ(values1.size(), 1);
+  EXPECT_GT(values1.begin()->second, 0);
+  auto values2 = SnapshotAndCombine(counters);
+  EXPECT_EQ(values2.size(), 1);
+  EXPECT_GT(values2.begin()->second, 0);
+  EXPECT_GT(values2.begin()->second, values1.begin()->second);
+}
+
+TEST(PerfCountersTest, Read1CounterEachCPU) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+#ifdef __linux__
+  EXPECT_TRUE(PerfCounters::Initialize());
+
+  cpu_set_t saved_set;
+  if (sched_getaffinity(0, sizeof(saved_set), &saved_set) != 0) {
+    // This can happen e.g. if there are more than CPU_SETSIZE CPUs.
+    GTEST_SKIP() << "Could not save CPU affinity mask.\n";
+  }
+
+  for (size_t cpu = 0; cpu != CPU_SETSIZE; ++cpu) {
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(cpu, &set);
+    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
+      break;
+    }
+
+    auto counters = PerfCounters::Create({kGenericPerfEvent1});
+    auto values1 = SnapshotAndCombine(counters);
+    EXPECT_EQ(values1.size(), 1);
+    EXPECT_GT(values1.begin()->second, 0);
+    auto values2 = SnapshotAndCombine(counters);
+    EXPECT_EQ(values2.size(), 1);
+    EXPECT_GT(values2.begin()->second, 0);
+    EXPECT_GT(values2.begin()->second, values1.begin()->second);
+  }
+
+  EXPECT_EQ(sched_setaffinity(0, sizeof(saved_set), &saved_set), 0);
+#else
+  GTEST_SKIP() << "Test skipped on non-Linux.\n";
+#endif
 }
 
 TEST(PerfCountersTest, Read2Counters) {
@@ -104,15 +167,17 @@
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
-  EXPECT_EQ(counters.num_counters(), 2);
-  PerfCounterValues values1(2);
-  EXPECT_TRUE(counters.Snapshot(&values1));
-  EXPECT_GT(values1[0], 0);
-  EXPECT_GT(values1[1], 0);
-  PerfCounterValues values2(2);
-  EXPECT_TRUE(counters.Snapshot(&values2));
-  EXPECT_GT(values2[0], 0);
-  EXPECT_GT(values2[1], 0);
+  auto values1 = SnapshotAndCombine(counters);
+  EXPECT_EQ(values1.size(), 2);
+  for (auto& kv : values1) {
+    EXPECT_GT(kv.second, 0);
+  }
+  auto values2 = SnapshotAndCombine(counters);
+  EXPECT_EQ(values2.size(), 2);
+  for (auto& kv : values2) {
+    EXPECT_GT(kv.second, 0);
+    EXPECT_GT(kv.second, values1[kv.first]);
+  }
 }
 
 TEST(PerfCountersTest, ReopenExistingCounters) {
@@ -127,7 +192,7 @@
   for (auto& counter : counters) {
     counter = PerfCounters::Create(kMetrics);
   }
-  PerfCounterValues values(1);
+  PerfCounterValues values(counters[0].num_counters());
   EXPECT_TRUE(counters[0].Snapshot(&values));
   EXPECT_TRUE(counters[1].Snapshot(&values));
 }
@@ -171,7 +236,8 @@
   size_t max_counters = kMaxCounters;
   for (size_t i = 0; i < kMaxCounters; ++i) {
     auto& counter(*perf_counter_measurements[i]);
-    EXPECT_EQ(counter.num_counters(), 1);
+    std::set<std::string> names{counter.names().begin(), counter.names().end()};
+    EXPECT_EQ(names.size(), 1);
     if (!counter.Start()) {
       max_counters = i;
       break;
@@ -212,8 +278,8 @@
   return sum;
 }
 
-void measure(size_t threadcount, PerfCounterValues* before,
-             PerfCounterValues* after) {
+void measure(size_t threadcount, std::map<std::string, uint64_t>* before,
+             std::map<std::string, uint64_t>* after) {
   BM_CHECK_NE(before, nullptr);
   BM_CHECK_NE(after, nullptr);
   std::vector<std::thread> threads(threadcount);
@@ -229,11 +295,11 @@
   for (auto& t : threads) {
     t = std::thread(work);
   }
-  counters.Snapshot(before);
+  *before = SnapshotAndCombine(counters);
   for (auto& t : threads) {
     t.join();
   }
-  counters.Snapshot(after);
+  *after = SnapshotAndCombine(counters);
 }
 
 TEST(PerfCountersTest, MultiThreaded) {
@@ -241,8 +307,7 @@
     GTEST_SKIP() << "Test skipped because libpfm is not supported.";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  PerfCounterValues before(2);
-  PerfCounterValues after(2);
+  std::map<std::string, uint64_t> before, after;
 
   // Notice that this test will work even if we taskset it to a single CPU
   // In this case the threads will run sequentially
@@ -250,15 +315,19 @@
   // instructions
   measure(2, &before, &after);
   std::vector<double> Elapsed2Threads{
-      static_cast<double>(after[0] - before[0]),
-      static_cast<double>(after[1] - before[1])};
+      static_cast<double>(after[kGenericPerfEvent1] -
+                          before[kGenericPerfEvent1]),
+      static_cast<double>(after[kGenericPerfEvent2] -
+                          before[kGenericPerfEvent2])};
 
   // Start four threads and measure the number of combined cycles and
   // instructions
   measure(4, &before, &after);
   std::vector<double> Elapsed4Threads{
-      static_cast<double>(after[0] - before[0]),
-      static_cast<double>(after[1] - before[1])};
+      static_cast<double>(after[kGenericPerfEvent1] -
+                          before[kGenericPerfEvent1]),
+      static_cast<double>(after[kGenericPerfEvent2] -
+                          before[kGenericPerfEvent2])};
 
   // The following expectations fail (at least on a beefy workstation with lots
   // of cpus) - it seems that in some circumstances the runtime of 4 threads