Replicate generic hardware events on all CPU PMUs (#2123)
On systems with more than one PMU for the CPUs (e.g., Apple M-series
SoCs), generic hardware events are created on only one, arbitrarily
chosen PMU. Usually this is the big cluster's PMU, which yields
inaccurate counts whenever the process is scheduled onto a little core.
To fix this, teach PerfCounters to register generic hardware events on
all CPU PMUs.

CPU PMUs are identified using the same method as perf: a device under
/sys/bus/event_source/devices counts as a CPU PMU if it is named "cpu"
or "cpum_cf", or if it exposes a "cpus" node.
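
For reference, this relies on the kernel's extended-type encoding: for
PERF_TYPE_HARDWARE events, bits 63:32 of perf_event_attr::config may
carry a PMU's sysfs "type" id, pinning the generic event to that PMU.
Below is a minimal standalone sketch (not part of this patch) of opening
cycles on one specific PMU this way; it assumes kernel headers recent
enough to define PERF_PMU_TYPE_SHIFT, and the helper name is ours:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #include <cstdint>
    #include <cstring>

    // Opens PERF_COUNT_HW_CPU_CYCLES on the PMU whose sysfs "type" id
    // is pmu_type, for the calling thread on any CPU. Returns the perf
    // fd, or -1 on failure.
    static int OpenCyclesOnPmu(std::uint64_t pmu_type) {
      perf_event_attr attr;
      std::memset(&attr, 0, sizeof(attr));
      attr.size = sizeof(attr);
      attr.type = PERF_TYPE_HARDWARE;
      // Extended-type encoding: PMU type id in bits 63:32, generic
      // hardware event id in bits 31:0.
      attr.config =
          (pmu_type << PERF_PMU_TYPE_SHIFT) | PERF_COUNT_HW_CPU_CYCLES;
      return static_cast<int>(syscall(SYS_perf_event_open, &attr,
                                      /*pid=*/0, /*cpu=*/-1,
                                      /*group_fd=*/-1, /*flags=*/0));
    }
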
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
index f47aa7b..e6f2209 100644
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@@ -16,9 +16,27 @@
+#include <cerrno>
+#include <cinttypes>
+#include <cstdio>
#include <cstring>
#include <memory>
+#include <optional>
+#include <string_view>
#include <vector>
#if defined HAVE_LIBPFM
+#include <dirent.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+// PERF_PMU_TYPE_SHIFT is only present in recent kernel headers; the
+// extended-type encoding it describes is stable uAPI, so fall back to the
+// documented value when it is missing.
+#ifndef PERF_PMU_TYPE_SHIFT
+#define PERF_PMU_TYPE_SHIFT 32
+#endif
+
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif
@@ -68,7 +86,7 @@
bool PerfCounters::IsCounterSupported(const std::string& name) {
Initialize();
- perf_event_attr_t attr;
+ perf_event_attr attr;
std::memset(&attr, 0, sizeof(attr));
pfm_perf_encode_arg_t arg;
std::memset(&arg, 0, sizeof(arg));
@@ -79,6 +97,63 @@
return (ret == PFM_SUCCESS);
}
+// Returns the perf "type" ids of all CPU (core) PMUs listed under
+// /sys/bus/event_source/devices, or std::nullopt if sysfs is unreadable.
+static std::optional<std::vector<uint64_t>> QueryCPUPMUTypes() {
+ std::vector<uint64_t> types;
+ DIR* dir = opendir("/sys/bus/event_source/devices");
+ if (!dir) {
+ return std::nullopt;
+ }
+ while (dirent* ent = readdir(dir)) {
+ std::string_view name_str = ent->d_name;
+ auto node_path = [&](const char* node) {
+ return std::string("/sys/bus/event_source/devices/") + ent->d_name + "/" +
+ node;
+ };
+ // Identify core PMUs the same way perf does: the device is named "cpu"
+ // (x86) or "cpum_cf" (s390), or it exposes a "cpus" node (uncore PMUs
+ // expose "cpumask" instead). Unexpected stat() errors conservatively
+ // count as a match.
+ struct stat st;
+ if (name_str == "cpu" || name_str == "cpum_cf" ||
+ stat(node_path("cpus").c_str(), &st) == 0 || errno != ENOENT) {
+ int type_fd = open(node_path("type").c_str(), O_RDONLY);
+ if (type_fd < 0) {
+ closedir(dir);
+ return std::nullopt;
+ }
+ char type_str[32] = {};
+ ssize_t res = read(type_fd, type_str, sizeof(type_str) - 1);
+ close(type_fd);
+ if (res < 0) {
+ closedir(dir);
+ return std::nullopt;
+ }
+ uint64_t type;
+ if (sscanf(type_str, "%" PRIu64, &type) != 1) {
+ closedir(dir);
+ return std::nullopt;
+ }
+ types.push_back(type);
+ }
+ }
+ closedir(dir);
+ return types;
+}
+
+static std::vector<uint64_t> GetPMUTypesForEvent(const perf_event_attr& attr) {
+ // Replicate generic hardware events on all CPU PMUs.
+ if (attr.type == PERF_TYPE_HARDWARE && attr.config < PERF_COUNT_HW_MAX) {
+ if (auto types = QueryCPUPMUTypes()) {
+ return *types;
+ }
+ }
+ // Not a generic hardware event, or sysfs was unreadable: open the event
+ // once with a PMU type id of 0, which leaves attr.config unchanged.
+ return {0};
+}
+
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
if (!counter_names.empty()) {
@@ -158,50 +233,56 @@
attr.read_format = PERF_FORMAT_GROUP; //| PERF_FORMAT_TOTAL_TIME_ENABLED |
// PERF_FORMAT_TOTAL_TIME_RUNNING;
- int id = -1;
- while (id < 0) {
- static constexpr size_t kNrOfSyscallRetries = 5;
- // Retry syscall as it was interrupted often (b/64774091).
- for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
- ++num_retries) {
- id = perf_event_open(&attr, 0, -1, group_id, 0);
- if (id >= 0 || errno != EINTR) {
- break;
+ // Open one copy of the event per CPU PMU, pinning each copy to its PMU
+ // via the extended-type encoding (PMU type id in config bits 63:32).
+ uint64_t base_config = attr.config;
+ for (uint64_t pmu : GetPMUTypesForEvent(attr)) {
+ attr.config = (pmu << PERF_PMU_TYPE_SHIFT) | base_config;
+ int id = -1;
+ while (id < 0) {
+ static constexpr size_t kNrOfSyscallRetries = 5;
+ // Retry syscall as it was interrupted often (b/64774091).
+ for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+ ++num_retries) {
+ id = perf_event_open(&attr, 0, -1, group_id, 0);
+ if (id >= 0 || errno != EINTR) {
+ break;
+ }
+ }
+ if (id < 0) {
+ // If the file descriptor is negative we might have reached a limit
+ // in the current group. Set the group_id to -1 and retry
+ if (group_id >= 0) {
+ // Create a new group
+ group_id = -1;
+ } else {
+ // At this point we have already retried to set a new group id and
+ // failed. We then give up.
+ break;
+ }
}
}
+
+ // We failed to get a new file descriptor. We might have reached a hard
+ // hardware limit that cannot be resolved even with group multiplexing
if (id < 0) {
- // If the file descriptor is negative we might have reached a limit
- // in the current group. Set the group_id to -1 and retry
- if (group_id >= 0) {
- // Create a new group
- group_id = -1;
- } else {
- // At this point we have already retried to set a new group id and
- // failed. We then give up.
- break;
- }
+ GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
+ "for performance counter "
+ << name << ". Ignoring\n";
+
+ // We give up on this counter but try to keep going
+ // as the others would be fine
+ continue;
}
+ if (group_id < 0) {
+ // This is a leader, store and assign it to the current file descriptor
+ leader_ids.push_back(id);
+ group_id = id;
+ }
+ // This is a valid counter, add it to our descriptor's list
+ counter_ids.push_back(id);
+ valid_names.push_back(name);
}
-
- // We failed to get a new file descriptor. We might have reached a hard
- // hardware limit that cannot be resolved even with group multiplexing
- if (id < 0) {
- GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
- "for performance counter "
- << name << ". Ignoring\n";
-
- // We give up on this counter but try to keep going
- // as the others would be fine
- continue;
- }
- if (group_id < 0) {
- // This is a leader, store and assign it to the current file descriptor
- leader_ids.push_back(id);
- group_id = id;
- }
- // This is a valid counter, add it to our descriptor's list
- counter_ids.push_back(id);
- valid_names.push_back(name);
}
// Loop through all group leaders activating them
diff --git a/src/perf_counters.h b/src/perf_counters.h
index bf5eb6b..4e45344 100644
--- a/src/perf_counters.h
+++ b/src/perf_counters.h
@@ -152,7 +152,7 @@
size_t num_counters() const { return counters_.num_counters(); }
- std::vector<std::string> names() const { return counters_.names(); }
+ const std::vector<std::string>& names() const { return counters_.names(); }
BENCHMARK_ALWAYS_INLINE bool Start() {
if (num_counters() == 0) return true;
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
index 5de262f..6c923be 100644
--- a/test/perf_counters_gtest.cc
+++ b/test/perf_counters_gtest.cc
@@ -0,0 +1,7 @@
+#include <map>
+#include <set>
+
+#ifdef __linux__
+#include <sched.h>
+#endif
+
@@ -27,12 +34,22 @@
EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
}
+// Generic events will have as many counters as there are CPU PMUs, and each
+// will have the same name. In order to make these tests independent of the
+// number of CPU PMUs in the system, we uniquify the counter names before
+// testing them.
+static std::set<std::string> UniqueCounterNames(const PerfCounters& pc) {
+ std::set<std::string> names{pc.names().begin(), pc.names().end()};
+ return names;
+}
+
TEST(PerfCountersTest, OneCounter) {
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Performance counters not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
- EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
+ EXPECT_EQ(
+ UniqueCounterNames(PerfCounters::Create({kGenericPerfEvent1})).size(), 1);
}
TEST(PerfCountersTest, NegativeTest) {
@@ -53,48 +70,105 @@
// number of counters has to be two, not zero
auto counter =
PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
- EXPECT_EQ(counter.num_counters(), 2);
- EXPECT_EQ(counter.names(), std::vector<std::string>(
- {kGenericPerfEvent2, kGenericPerfEvent1}));
+ auto names = UniqueCounterNames(counter);
+ EXPECT_EQ(names.size(), 2);
+ EXPECT_EQ(names,
+ std::set<std::string>({kGenericPerfEvent2, kGenericPerfEvent1}));
}
{
// Try sneaking in an outrageous counter, like a fat finger mistake
auto counter = PerfCounters::Create(
{kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
- EXPECT_EQ(counter.num_counters(), 2);
- EXPECT_EQ(counter.names(), std::vector<std::string>(
- {kGenericPerfEvent2, kGenericPerfEvent1}));
+ auto names = UniqueCounterNames(counter);
+ EXPECT_EQ(names.size(), 2);
+ EXPECT_EQ(names,
+ std::set<std::string>({kGenericPerfEvent2, kGenericPerfEvent1}));
}
{
// Finally try a golden input - it should like both of them
- EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2})
- .num_counters(),
+ EXPECT_EQ(UniqueCounterNames(PerfCounters::Create(
+ {kGenericPerfEvent1, kGenericPerfEvent2}))
+ .size(),
2);
}
{
// Add a bad apple in the end of the chain to check the edges
auto counter = PerfCounters::Create(
{kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
- EXPECT_EQ(counter.num_counters(), 2);
- EXPECT_EQ(counter.names(), std::vector<std::string>(
- {kGenericPerfEvent1, kGenericPerfEvent2}));
+ auto names = UniqueCounterNames(counter);
+ EXPECT_EQ(names.size(), 2);
+ EXPECT_EQ(names,
+ std::set<std::string>({kGenericPerfEvent1, kGenericPerfEvent2}));
}
}
+// Takes a snapshot and sums the values of same-named counters (generic
+// events are replicated once per CPU PMU), keyed by counter name.
+static std::map<std::string, uint64_t> SnapshotAndCombine(
+ PerfCounters& counters) {
+ PerfCounterValues values(counters.num_counters());
+ std::map<std::string, uint64_t> value_map;
+
+ if (counters.Snapshot(&values)) {
+ for (size_t i = 0; i != counters.num_counters(); ++i) {
+ value_map[counters.names()[i]] += values[i];
+ }
+ }
+ return value_map;
+}
+
TEST(PerfCountersTest, Read1Counter) {
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
auto counters = PerfCounters::Create({kGenericPerfEvent1});
- EXPECT_EQ(counters.num_counters(), 1);
- PerfCounterValues values1(1);
- EXPECT_TRUE(counters.Snapshot(&values1));
- EXPECT_GT(values1[0], 0);
- PerfCounterValues values2(1);
- EXPECT_TRUE(counters.Snapshot(&values2));
- EXPECT_GT(values2[0], 0);
- EXPECT_GT(values2[0], values1[0]);
+ auto values1 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values1.size(), 1);
+ EXPECT_GT(values1.begin()->second, 0);
+ auto values2 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values2.size(), 1);
+ EXPECT_GT(values2.begin()->second, 0);
+ EXPECT_GT(values2.begin()->second, values1.begin()->second);
+}
+
+TEST(PerfCountersTest, Read1CounterEachCPU) {
+ if (!PerfCounters::kSupported) {
+ GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+ }
+#ifdef __linux__
+ EXPECT_TRUE(PerfCounters::Initialize());
+
+ cpu_set_t saved_set;
+ if (sched_getaffinity(0, sizeof(saved_set), &saved_set) != 0) {
+ // This can happen e.g. if there are more than CPU_SETSIZE CPUs.
+ GTEST_SKIP() << "Could not save CPU affinity mask.\n";
+ }
+
+ // Pin the test to each CPU in turn, stopping at the first id we cannot
+ // bind to (CPU ids are assumed contiguous from zero).
+ for (size_t cpu = 0; cpu != CPU_SETSIZE; ++cpu) {
+ cpu_set_t set;
+ CPU_ZERO(&set);
+ CPU_SET(cpu, &set);
+ if (sched_setaffinity(0, sizeof(set), &set) != 0) {
+ break;
+ }
+
+ auto counters = PerfCounters::Create({kGenericPerfEvent1});
+ auto values1 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values1.size(), 1);
+ EXPECT_GT(values1.begin()->second, 0);
+ auto values2 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values2.size(), 1);
+ EXPECT_GT(values2.begin()->second, 0);
+ EXPECT_GT(values2.begin()->second, values1.begin()->second);
+ }
+
+ EXPECT_EQ(sched_setaffinity(0, sizeof(saved_set), &saved_set), 0);
+#else
+ GTEST_SKIP() << "Test skipped on non-Linux.\n";
+#endif
}
TEST(PerfCountersTest, Read2Counters) {
@@ -104,15 +178,17 @@
EXPECT_TRUE(PerfCounters::Initialize());
auto counters =
PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
- EXPECT_EQ(counters.num_counters(), 2);
- PerfCounterValues values1(2);
- EXPECT_TRUE(counters.Snapshot(&values1));
- EXPECT_GT(values1[0], 0);
- EXPECT_GT(values1[1], 0);
- PerfCounterValues values2(2);
- EXPECT_TRUE(counters.Snapshot(&values2));
- EXPECT_GT(values2[0], 0);
- EXPECT_GT(values2[1], 0);
+ auto values1 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values1.size(), 2);
+ for (auto& kv : values1) {
+ EXPECT_GT(kv.second, 0);
+ }
+ auto values2 = SnapshotAndCombine(counters);
+ EXPECT_EQ(values2.size(), 2);
+ for (auto& kv : values2) {
+ EXPECT_GT(kv.second, 0);
+ EXPECT_GT(kv.second, values1[kv.first]);
+ }
}
TEST(PerfCountersTest, ReopenExistingCounters) {
@@ -127,7 +203,7 @@
for (auto& counter : counters) {
counter = PerfCounters::Create(kMetrics);
}
- PerfCounterValues values(1);
+ PerfCounterValues values(counters[0].num_counters());
EXPECT_TRUE(counters[0].Snapshot(&values));
EXPECT_TRUE(counters[1].Snapshot(&values));
}
@@ -171,7 +247,8 @@
size_t max_counters = kMaxCounters;
for (size_t i = 0; i < kMaxCounters; ++i) {
auto& counter(*perf_counter_measurements[i]);
- EXPECT_EQ(counter.num_counters(), 1);
+ std::set<std::string> names{counter.names().begin(), counter.names().end()};
+ EXPECT_EQ(names.size(), 1);
if (!counter.Start()) {
max_counters = i;
break;
@@ -212,8 +289,8 @@
return sum;
}
-void measure(size_t threadcount, PerfCounterValues* before,
- PerfCounterValues* after) {
+void measure(size_t threadcount, std::map<std::string, uint64_t>* before,
+ std::map<std::string, uint64_t>* after) {
BM_CHECK_NE(before, nullptr);
BM_CHECK_NE(after, nullptr);
std::vector<std::thread> threads(threadcount);
@@ -229,11 +306,11 @@
for (auto& t : threads) {
t = std::thread(work);
}
- counters.Snapshot(before);
+ *before = SnapshotAndCombine(counters);
for (auto& t : threads) {
t.join();
}
- counters.Snapshot(after);
+ *after = SnapshotAndCombine(counters);
}
TEST(PerfCountersTest, MultiThreaded) {
@@ -241,8 +318,7 @@
GTEST_SKIP() << "Test skipped because libpfm is not supported.";
}
EXPECT_TRUE(PerfCounters::Initialize());
- PerfCounterValues before(2);
- PerfCounterValues after(2);
+ std::map<std::string, uint64_t> before, after;
// Notice that this test will work even if we taskset it to a single CPU
// In this case the threads will run sequentially
@@ -250,15 +326,19 @@
// instructions
measure(2, &before, &after);
std::vector<double> Elapsed2Threads{
- static_cast<double>(after[0] - before[0]),
- static_cast<double>(after[1] - before[1])};
+ static_cast<double>(after[kGenericPerfEvent1] -
+ before[kGenericPerfEvent1]),
+ static_cast<double>(after[kGenericPerfEvent2] -
+ before[kGenericPerfEvent2])};
// Start four threads and measure the number of combined cycles and
// instructions
measure(4, &before, &after);
std::vector<double> Elapsed4Threads{
- static_cast<double>(after[0] - before[0]),
- static_cast<double>(after[1] - before[1])};
+ static_cast<double>(after[kGenericPerfEvent1] -
+ before[kGenericPerfEvent1]),
+ static_cast<double>(after[kGenericPerfEvent2] -
+ before[kGenericPerfEvent2])};
// The following expectations fail (at least on a beefy workstation with lots
// of cpus) - it seems that in some circumstances the runtime of 4 threads