// test/perf_counters_gtest.cc - third_party/github/google/benchmark - Git at Google

 #include <mutex>
 #include <random>
 #include <thread>

 #include "../src/perf_counters.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"

 // Fallback used only when the googletest in use does not define GTEST_SKIP.
 #ifndef GTEST_SKIP
 // Sink type whose assignment operator accepts an ostream and returns void,
 // which lets the macro below appear as the operand of `return` in a test body.
 struct MsgHandler {
   void operator=(std::ostream&) {}
 };
 // `<<` binds tighter than `=`, so `GTEST_SKIP() << "msg"` streams the message
 // to std::cout, assigns the resulting stream to a temporary MsgHandler and
 // returns from the enclosing function.
 #define GTEST_SKIP() return MsgHandler() = std::cout
 #endif

 using benchmark::internal::PerfCounters;
 using benchmark::internal::PerfCountersMeasurement;
 using benchmark::internal::PerfCounterValues;
 using ::testing::AllOf;
 using ::testing::Gt;
 using ::testing::Lt;

 namespace {
 const char kGenericPerfEvent1[] = "CYCLES";
 const char kGenericPerfEvent2[] = "INSTRUCTIONS";

 // Initialize() must report success exactly when PerfCounters::kSupported is
 // true.
 TEST(PerfCountersTest, Init) {
   const bool initialized = PerfCounters::Initialize();
   EXPECT_EQ(initialized, PerfCounters::kSupported);
 }

 // A generic event produces one counter per CPU PMU, and all of them share
 // the same name. Collapsing the names into a set keeps the tests independent
 // of how many CPU PMUs the system has.
 static std::set<std::string> UniqueCounterNames(const PerfCounters& pc) {
   return std::set<std::string>(pc.names().begin(), pc.names().end());
 }

 // Requesting a single generic event must yield exactly one distinct name.
 TEST(PerfCountersTest, OneCounter) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Performance counters not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters = PerfCounters::Create({kGenericPerfEvent1});
   const std::set<std::string> unique_names = UniqueCounterNames(counters);
   EXPECT_EQ(unique_names.size(), 1);
 }

 // Create() must cope with missing, empty and bogus counter names: they are
 // dropped, while every valid counter in the same request is kept.
 TEST(PerfCountersTest, NegativeTest) {
   if (!PerfCounters::kSupported) {
     // Without support, Initialize() is expected to report failure.
     EXPECT_FALSE(PerfCounters::Initialize());
     return;
   }
   EXPECT_TRUE(PerfCounters::Initialize());

   // The pair of valid generic events that each mixed request must retain.
   const std::set<std::string> both_events{kGenericPerfEvent1,
                                           kGenericPerfEvent2};

   // Safety checks: Create() always hands back a valid object, even for an
   // empty or entirely wrong request, because unsupported counters are
   // warned about and dropped.
   EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
   EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
   EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
   {
     // An empty name between two valid ones must be filtered out, leaving
     // two counters rather than zero.
     auto mixed =
         PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
     const auto unique_names = UniqueCounterNames(mixed);
     EXPECT_EQ(unique_names.size(), 2);
     EXPECT_EQ(unique_names, both_events);
   }
   {
     // Same idea with an outrageous name, like a fat-finger mistake.
     auto mixed = PerfCounters::Create(
         {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
     const auto unique_names = UniqueCounterNames(mixed);
     EXPECT_EQ(unique_names.size(), 2);
     EXPECT_EQ(unique_names, both_events);
   }
   {
     // Golden input: both names are valid, so both must be accepted.
     auto golden =
         PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
     EXPECT_EQ(UniqueCounterNames(golden).size(), 2);
   }
   {
     // A bad name at the very end of the list checks the edges.
     auto mixed = PerfCounters::Create(
         {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
     const auto unique_names = UniqueCounterNames(mixed);
     EXPECT_EQ(unique_names.size(), 2);
     EXPECT_EQ(unique_names, both_events);
   }
 }

 // Takes one snapshot of `counters` and adds up the values of all counters
 // that share a name. Returns an empty map when the snapshot fails.
 static std::map<std::string, uint64_t> SnapshotAndCombine(
     PerfCounters& counters) {
   std::map<std::string, uint64_t> totals;
   PerfCounterValues snapshot(counters.num_counters());

   if (!counters.Snapshot(&snapshot)) {
     return totals;
   }
   const auto& names = counters.names();
   const size_t count = counters.num_counters();
   for (size_t idx = 0; idx < count; ++idx) {
     totals[names[idx]] += snapshot[idx];
   }
   return totals;
 }

 // Two consecutive snapshots of one generic counter must both be positive,
 // and the second must be strictly larger than the first.
 TEST(PerfCountersTest, Read1Counter) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters = PerfCounters::Create({kGenericPerfEvent1});
   auto values1 = SnapshotAndCombine(counters);
   // ASSERT rather than EXPECT: begin() is dereferenced below, which is
   // undefined on the empty map that a failed snapshot produces.
   ASSERT_EQ(values1.size(), 1);
   EXPECT_GT(values1.begin()->second, 0);
   auto values2 = SnapshotAndCombine(counters);
   ASSERT_EQ(values2.size(), 1);
   EXPECT_GT(values2.begin()->second, 0);
   EXPECT_GT(values2.begin()->second, values1.begin()->second);
 }

 // Repeats the Read1Counter checks while pinned to each CPU in turn, then
 // restores the affinity mask that was in effect beforehand. Linux only.
 TEST(PerfCountersTest, Read1CounterEachCPU) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
 #ifdef __linux__
   EXPECT_TRUE(PerfCounters::Initialize());

   cpu_set_t saved_set;
   if (sched_getaffinity(0, sizeof(saved_set), &saved_set) != 0) {
     // This can happen e.g. if there are more than CPU_SETSIZE CPUs.
     GTEST_SKIP() << "Could not save CPU affinity mask.\n";
   }

   for (size_t cpu = 0; cpu != CPU_SETSIZE; ++cpu) {
     cpu_set_t set;
     CPU_ZERO(&set);
     CPU_SET(cpu, &set);
     if (sched_setaffinity(0, sizeof(set), &set) != 0) {
       // Stop at the first CPU this thread cannot be pinned to.
       break;
     }

     auto counters = PerfCounters::Create({kGenericPerfEvent1});
     auto values1 = SnapshotAndCombine(counters);
     EXPECT_EQ(values1.size(), 1);
     if (values1.empty()) {
       // A failed snapshot yields an empty map, and dereferencing begin()
       // would be undefined. The failure is already recorded above; we do
       // not ASSERT because the affinity mask must still be restored below.
       continue;
     }
     EXPECT_GT(values1.begin()->second, 0);
     auto values2 = SnapshotAndCombine(counters);
     EXPECT_EQ(values2.size(), 1);
     if (values2.empty()) {
       continue;
     }
     EXPECT_GT(values2.begin()->second, 0);
     EXPECT_GT(values2.begin()->second, values1.begin()->second);
   }

   EXPECT_EQ(sched_setaffinity(0, sizeof(saved_set), &saved_set), 0);
 #else
   GTEST_SKIP() << "Test skipped on non-Linux.\n";
 #endif
 }

 // Two consecutive snapshots of two generic counters: every value must be
 // positive, and each counter must have grown between the snapshots.
 TEST(PerfCountersTest, Read2Counters) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
   auto values1 = SnapshotAndCombine(counters);
   EXPECT_EQ(values1.size(), 2);
   for (auto& kv : values1) {
     EXPECT_GT(kv.second, 0);
   }
   auto values2 = SnapshotAndCombine(counters);
   // Check the second snapshot here; the size of the first one was already
   // verified above.
   EXPECT_EQ(values2.size(), 2);
   for (auto& kv : values2) {
     EXPECT_GT(kv.second, 0);
     EXPECT_GT(kv.second, values1[kv.first]);
   }
 }

 // Creating the same event twice must leave both objects able to snapshot.
 TEST(PerfCountersTest, ReopenExistingCounters) {
   // This test works in recent and old Intel hardware, Pixel 3, and Pixel 6.
   // However we cannot make assumptions beyond 2 HW counters due to Pixel 6.
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());

   const std::vector<std::string> metrics{kGenericPerfEvent1};
   std::vector<PerfCounters> handles(2);
   for (size_t idx = 0; idx < handles.size(); ++idx) {
     handles[idx] = PerfCounters::Create(metrics);
   }

   // One value buffer, sized from the first object, is reused for both reads.
   PerfCounterValues values(handles[0].num_counters());
   EXPECT_TRUE(handles[0].Snapshot(&values));
   EXPECT_TRUE(handles[1].Snapshot(&values));
 }

 // Creates several measurements of the same event and checks that at least
 // kMinValidCounters of them can be started and stopped, first as a batch
 // and then one at a time.
 TEST(PerfCountersTest, CreateExistingMeasurements) {
   // The test works (i.e. causes read to fail) for the assumptions
   // about hardware capabilities (i.e. small number (2) hardware
   // counters) at this date,
   // the same as previous test ReopenExistingCounters.
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());

   // We try kMaxCounters measurements, but at this time we can only rely on
   // kMinValidCounters of them working. Perhaps in the future we could use
   // libpfm to query for the hardware limits on this particular platform.
   // Both constants are size_t so that comparisons with indices stay
   // unsigned.
   const size_t kMaxCounters = 10;
   const size_t kMinValidCounters = 2;

   // Let's use a ubiquitous counter that is guaranteed to work
   // on all platforms
   const std::vector<std::string> kMetrics{"cycles"};

   // Cannot create a vector of actual objects because the
   // copy constructor of PerfCounters is deleted - and so is
   // implicitly deleted on PerfCountersMeasurement too
   std::vector<std::unique_ptr<PerfCountersMeasurement>>
       perf_counter_measurements;

   perf_counter_measurements.reserve(kMaxCounters);
   for (size_t j = 0; j < kMaxCounters; ++j) {
     perf_counter_measurements.emplace_back(
         new PerfCountersMeasurement(kMetrics));
   }

   std::vector<std::pair<std::string, double>> measurements;

   // Start all counters together to see if they hold. max_counters ends up
   // as the index of the first measurement that refused to start.
   size_t max_counters = kMaxCounters;
   for (size_t i = 0; i < kMaxCounters; ++i) {
     auto& counter(*perf_counter_measurements[i]);
     std::set<std::string> names{counter.names().begin(), counter.names().end()};
     EXPECT_EQ(names.size(), 1);
     if (!counter.Start()) {
       max_counters = i;
       break;
     }
   }

   ASSERT_GE(max_counters, kMinValidCounters);

   // Stop all together. Only the first kMinValidCounters are required to
   // succeed.
   for (size_t i = 0; i < max_counters; ++i) {
     auto& counter(*perf_counter_measurements[i]);
     EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
   }

   // Start/stop individually
   for (size_t i = 0; i < max_counters; ++i) {
     auto& counter(*perf_counter_measurements[i]);
     measurements.clear();
     counter.Start();
     EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
   }
 }

 // Burns CPU time by summing kNumLoops pseudo-random draws. The compiler
 // insists on optimizing the loop away, hence the no-optimize macro and the
 // DoNotOptimize call; the randomly seeded generator adds entropy in case
 // those are not enough.

 BENCHMARK_DONT_OPTIMIZE size_t do_work() {
   static std::mt19937 rd{std::random_device{}()};
   static std::uniform_int_distribution<size_t> mrand(0, 10);
   const size_t kNumLoops = 1000000;
   size_t total = 0;
   for (size_t left = kNumLoops; left != 0; --left) {
     total += mrand(rd);
   }
   benchmark::DoNotOptimize(total);
   return total;
 }

 // Runs `threadcount` worker threads that each call do_work(). Stores the
 // combined counter values taken before any worker does its work in
 // `*before`, and those taken after all workers have been joined in `*after`.
 // Both pointers are checked to be non-null.
 void measure(size_t threadcount, std::map<std::string, uint64_t>* before,
              std::map<std::string, uint64_t>* after) {
   BM_CHECK_NE(before, nullptr);
   BM_CHECK_NE(after, nullptr);
   std::vector<std::thread> threads(threadcount);
   // Because we do not care whether the threads execute concurrently, but we do
   // care that they do all of their work between the SnapshotAndCombine calls,
   // we serialize them with a mutex. See
   // https://github.com/google/benchmark/issues/2173.
   std::mutex mutex;
   auto work = [&mutex]() {
     // Scoped guard instead of manual lock()/unlock(): the mutex is released
     // on every path out of the lambda.
     std::lock_guard<std::mutex> guard(mutex);
     BM_CHECK(do_work() > 1000);
   };

   // The counters are set up before the threads are started so that the
   // threads inherit them.
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
   {
     // Hold the mutex while spawning so that no worker can begin its work
     // before the "before" snapshot has been taken.
     std::lock_guard<std::mutex> hold(mutex);
     for (auto& t : threads) {
       t = std::thread(work);
     }
     *before = SnapshotAndCombine(counters);
   }
   // Joining every thread ensures all the work has finished before the
   // "after" snapshot.
   for (auto& t : threads) {
     t.join();
   }
   *after = SnapshotAndCombine(counters);
 }

 // Compares the cycles and instructions consumed by two worker threads with
 // those consumed by four, and requires the two to be within 10x of each
 // other.
 TEST(PerfCountersTest, MultiThreaded) {
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   std::map<std::string, uint64_t> before, after;

   // Growth of one event between the most recent pair of snapshots.
   auto elapsed = [&before, &after](const char* event) {
     return static_cast<double>(after[event] - before[event]);
   };

   // Notice that this test will work even if we taskset it to a single CPU
   // In this case the threads will run sequentially
   // Start two threads and measure the number of combined cycles and
   // instructions
   measure(2, &before, &after);
   const std::vector<double> two_threads{elapsed(kGenericPerfEvent1),
                                         elapsed(kGenericPerfEvent2)};

   // Start four threads and measure the number of combined cycles and
   // instructions
   measure(4, &before, &after);
   const std::vector<double> four_threads{elapsed(kGenericPerfEvent1),
                                          elapsed(kGenericPerfEvent2)};

   // Expecting four threads to be slower than two fails (at least on a beefy
   // workstation with lots of cpus) - in some circumstances the runtime of 4
   // threads can even be better than with 2. So we only make sure the two
   // runs do not differ too much in general (one is not more than 10x than
   // the other).
   const double cycles_ratio = four_threads[0] / two_threads[0];
   const double instructions_ratio = four_threads[1] / two_threads[1];
   EXPECT_THAT(cycles_ratio, AllOf(Gt(0.1), Lt(10)));
   EXPECT_THAT(instructions_ratio, AllOf(Gt(0.1), Lt(10)));
 }

 // Starts and stops a single measurement over a small group of hardware
 // events, restricted to the events reported as supported.
 TEST(PerfCountersTest, HardwareLimits) {
   // The test works (i.e. causes read to fail) for the assumptions
   // about hardware capabilities (i.e. small number (3-4) hardware
   // counters) at this date,
   // the same as previous test ReopenExistingCounters.
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());

   // Taken from `perf list`, but focusses only on those HW events that actually
   // were reported when running `sudo perf stat -a sleep 10`, intersected over
   // several platforms. All HW events listed in the first command not reported
   // in the second seem to not work. This is sad as we don't really get to test
   // the grouping here (groups can contain up to 6 members)...
   const std::vector<std::string> candidates{
       "cycles",         // leader
       "instructions",   //
       "branch-misses",  //
   };

   // In the off-chance that some of these events are not supported, filter
   // them out so the test completes without failure, even though it might
   // then not actually test the grouping on that platform.
   std::vector<std::string> supported;
   for (const auto& candidate : candidates) {
     if (!PerfCounters::IsCounterSupported(candidate)) {
       continue;
     }
     supported.push_back(candidate);
   }
   PerfCountersMeasurement measurement(supported);

   std::vector<std::pair<std::string, double>> results;

   measurement.Start();
   EXPECT_TRUE(measurement.Stop(results));
 }

 }  // namespace
	#include <mutex>
	#include <random>
	#include <thread>

	#include "../src/perf_counters.h"
	#include "gmock/gmock.h"
	#include "gtest/gtest.h"

	// Fallback used only when the googletest in use does not define GTEST_SKIP.
	#ifndef GTEST_SKIP
	// Sink type whose assignment operator accepts an ostream and returns void,
	// which lets the macro below appear as the operand of `return` in a test body.
	struct MsgHandler {
	void operator=(std::ostream&) {}
	};
	// `<<` binds tighter than `=`, so `GTEST_SKIP() << "msg"` streams the message
	// to std::cout, assigns the resulting stream to a temporary MsgHandler and
	// returns from the enclosing function.
	#define GTEST_SKIP() return MsgHandler() = std::cout
	#endif

	using benchmark::internal::PerfCounters;
	using benchmark::internal::PerfCountersMeasurement;
	using benchmark::internal::PerfCounterValues;
	using ::testing::AllOf;
	using ::testing::Gt;
	using ::testing::Lt;

	namespace {
	const char kGenericPerfEvent1[] = "CYCLES";
	const char kGenericPerfEvent2[] = "INSTRUCTIONS";

	// Initialize() must report success exactly when PerfCounters::kSupported is
	// true.
	TEST(PerfCountersTest, Init) {
	  const bool initialized = PerfCounters::Initialize();
	  EXPECT_EQ(initialized, PerfCounters::kSupported);
	}

	// A generic event produces one counter per CPU PMU, and all of them share
	// the same name. Collapsing the names into a set keeps the tests independent
	// of how many CPU PMUs the system has.
	static std::set<std::string> UniqueCounterNames(const PerfCounters& pc) {
	  return std::set<std::string>(pc.names().begin(), pc.names().end());
	}

	// Requesting a single generic event must yield exactly one distinct name.
	TEST(PerfCountersTest, OneCounter) {
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Performance counters not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());
	  auto counters = PerfCounters::Create({kGenericPerfEvent1});
	  const std::set<std::string> unique_names = UniqueCounterNames(counters);
	  EXPECT_EQ(unique_names.size(), 1);
	}

	// Create() must cope with missing, empty and bogus counter names: they are
	// dropped, while every valid counter in the same request is kept.
	TEST(PerfCountersTest, NegativeTest) {
	  if (!PerfCounters::kSupported) {
	    // Without support, Initialize() is expected to report failure.
	    EXPECT_FALSE(PerfCounters::Initialize());
	    return;
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());

	  // The pair of valid generic events that each mixed request must retain.
	  const std::set<std::string> both_events{kGenericPerfEvent1,
	                                          kGenericPerfEvent2};

	  // Safety checks: Create() always hands back a valid object, even for an
	  // empty or entirely wrong request, because unsupported counters are
	  // warned about and dropped.
	  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
	  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
	  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
	  {
	    // An empty name between two valid ones must be filtered out, leaving
	    // two counters rather than zero.
	    auto mixed =
	        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
	    const auto unique_names = UniqueCounterNames(mixed);
	    EXPECT_EQ(unique_names.size(), 2);
	    EXPECT_EQ(unique_names, both_events);
	  }
	  {
	    // Same idea with an outrageous name, like a fat-finger mistake.
	    auto mixed = PerfCounters::Create(
	        {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1});
	    const auto unique_names = UniqueCounterNames(mixed);
	    EXPECT_EQ(unique_names.size(), 2);
	    EXPECT_EQ(unique_names, both_events);
	  }
	  {
	    // Golden input: both names are valid, so both must be accepted.
	    auto golden =
	        PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
	    EXPECT_EQ(UniqueCounterNames(golden).size(), 2);
	  }
	  {
	    // A bad name at the very end of the list checks the edges.
	    auto mixed = PerfCounters::Create(
	        {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"});
	    const auto unique_names = UniqueCounterNames(mixed);
	    EXPECT_EQ(unique_names.size(), 2);
	    EXPECT_EQ(unique_names, both_events);
	  }
	}

	// Takes one snapshot of `counters` and adds up the values of all counters
	// that share a name. Returns an empty map when the snapshot fails.
	static std::map<std::string, uint64_t> SnapshotAndCombine(
	    PerfCounters& counters) {
	  std::map<std::string, uint64_t> totals;
	  PerfCounterValues snapshot(counters.num_counters());

	  if (!counters.Snapshot(&snapshot)) {
	    return totals;
	  }
	  const auto& names = counters.names();
	  const size_t count = counters.num_counters();
	  for (size_t idx = 0; idx < count; ++idx) {
	    totals[names[idx]] += snapshot[idx];
	  }
	  return totals;
	}

	// Two consecutive snapshots of one generic counter must both be positive,
	// and the second must be strictly larger than the first.
	TEST(PerfCountersTest, Read1Counter) {
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());
	  auto counters = PerfCounters::Create({kGenericPerfEvent1});
	  auto values1 = SnapshotAndCombine(counters);
	  // ASSERT rather than EXPECT: begin() is dereferenced below, which is
	  // undefined on the empty map that a failed snapshot produces.
	  ASSERT_EQ(values1.size(), 1);
	  EXPECT_GT(values1.begin()->second, 0);
	  auto values2 = SnapshotAndCombine(counters);
	  ASSERT_EQ(values2.size(), 1);
	  EXPECT_GT(values2.begin()->second, 0);
	  EXPECT_GT(values2.begin()->second, values1.begin()->second);
	}

	// Repeats the Read1Counter checks while pinned to each CPU in turn, then
	// restores the affinity mask that was in effect beforehand. Linux only.
	TEST(PerfCountersTest, Read1CounterEachCPU) {
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	#ifdef __linux__
	  EXPECT_TRUE(PerfCounters::Initialize());

	  cpu_set_t saved_set;
	  if (sched_getaffinity(0, sizeof(saved_set), &saved_set) != 0) {
	    // This can happen e.g. if there are more than CPU_SETSIZE CPUs.
	    GTEST_SKIP() << "Could not save CPU affinity mask.\n";
	  }

	  for (size_t cpu = 0; cpu != CPU_SETSIZE; ++cpu) {
	    cpu_set_t set;
	    CPU_ZERO(&set);
	    CPU_SET(cpu, &set);
	    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
	      // Stop at the first CPU this thread cannot be pinned to.
	      break;
	    }

	    auto counters = PerfCounters::Create({kGenericPerfEvent1});
	    auto values1 = SnapshotAndCombine(counters);
	    EXPECT_EQ(values1.size(), 1);
	    if (values1.empty()) {
	      // A failed snapshot yields an empty map, and dereferencing begin()
	      // would be undefined. The failure is already recorded above; we do
	      // not ASSERT because the affinity mask must still be restored below.
	      continue;
	    }
	    EXPECT_GT(values1.begin()->second, 0);
	    auto values2 = SnapshotAndCombine(counters);
	    EXPECT_EQ(values2.size(), 1);
	    if (values2.empty()) {
	      continue;
	    }
	    EXPECT_GT(values2.begin()->second, 0);
	    EXPECT_GT(values2.begin()->second, values1.begin()->second);
	  }

	  EXPECT_EQ(sched_setaffinity(0, sizeof(saved_set), &saved_set), 0);
	#else
	  GTEST_SKIP() << "Test skipped on non-Linux.\n";
	#endif
	}

	// Two consecutive snapshots of two generic counters: every value must be
	// positive, and each counter must have grown between the snapshots.
	TEST(PerfCountersTest, Read2Counters) {
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());
	  auto counters =
	      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
	  auto values1 = SnapshotAndCombine(counters);
	  EXPECT_EQ(values1.size(), 2);
	  for (auto& kv : values1) {
	    EXPECT_GT(kv.second, 0);
	  }
	  auto values2 = SnapshotAndCombine(counters);
	  // Check the second snapshot here; the size of the first one was already
	  // verified above.
	  EXPECT_EQ(values2.size(), 2);
	  for (auto& kv : values2) {
	    EXPECT_GT(kv.second, 0);
	    EXPECT_GT(kv.second, values1[kv.first]);
	  }
	}

	// Creating the same event twice must leave both objects able to snapshot.
	TEST(PerfCountersTest, ReopenExistingCounters) {
	  // This test works in recent and old Intel hardware, Pixel 3, and Pixel 6.
	  // However we cannot make assumptions beyond 2 HW counters due to Pixel 6.
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());

	  const std::vector<std::string> metrics{kGenericPerfEvent1};
	  std::vector<PerfCounters> handles(2);
	  for (size_t idx = 0; idx < handles.size(); ++idx) {
	    handles[idx] = PerfCounters::Create(metrics);
	  }

	  // One value buffer, sized from the first object, is reused for both reads.
	  PerfCounterValues values(handles[0].num_counters());
	  EXPECT_TRUE(handles[0].Snapshot(&values));
	  EXPECT_TRUE(handles[1].Snapshot(&values));
	}

	// Creates several measurements of the same event and checks that at least
	// kMinValidCounters of them can be started and stopped, first as a batch
	// and then one at a time.
	TEST(PerfCountersTest, CreateExistingMeasurements) {
	  // The test works (i.e. causes read to fail) for the assumptions
	  // about hardware capabilities (i.e. small number (2) hardware
	  // counters) at this date,
	  // the same as previous test ReopenExistingCounters.
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());

	  // We try kMaxCounters measurements, but at this time we can only rely on
	  // kMinValidCounters of them working. Perhaps in the future we could use
	  // libpfm to query for the hardware limits on this particular platform.
	  // Both constants are size_t so that comparisons with indices stay
	  // unsigned.
	  const size_t kMaxCounters = 10;
	  const size_t kMinValidCounters = 2;

	  // Let's use a ubiquitous counter that is guaranteed to work
	  // on all platforms
	  const std::vector<std::string> kMetrics{"cycles"};

	  // Cannot create a vector of actual objects because the
	  // copy constructor of PerfCounters is deleted - and so is
	  // implicitly deleted on PerfCountersMeasurement too
	  std::vector<std::unique_ptr<PerfCountersMeasurement>>
	      perf_counter_measurements;

	  perf_counter_measurements.reserve(kMaxCounters);
	  for (size_t j = 0; j < kMaxCounters; ++j) {
	    perf_counter_measurements.emplace_back(
	        new PerfCountersMeasurement(kMetrics));
	  }

	  std::vector<std::pair<std::string, double>> measurements;

	  // Start all counters together to see if they hold. max_counters ends up
	  // as the index of the first measurement that refused to start.
	  size_t max_counters = kMaxCounters;
	  for (size_t i = 0; i < kMaxCounters; ++i) {
	    auto& counter(*perf_counter_measurements[i]);
	    std::set<std::string> names{counter.names().begin(), counter.names().end()};
	    EXPECT_EQ(names.size(), 1);
	    if (!counter.Start()) {
	      max_counters = i;
	      break;
	    }
	  }

	  ASSERT_GE(max_counters, kMinValidCounters);

	  // Stop all together. Only the first kMinValidCounters are required to
	  // succeed.
	  for (size_t i = 0; i < max_counters; ++i) {
	    auto& counter(*perf_counter_measurements[i]);
	    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
	  }

	  // Start/stop individually
	  for (size_t i = 0; i < max_counters; ++i) {
	    auto& counter(*perf_counter_measurements[i]);
	    measurements.clear();
	    counter.Start();
	    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
	  }
	}

	// Burns CPU time by summing kNumLoops pseudo-random draws. The compiler
	// insists on optimizing the loop away, hence the no-optimize macro and the
	// DoNotOptimize call; the randomly seeded generator adds entropy in case
	// those are not enough.

	BENCHMARK_DONT_OPTIMIZE size_t do_work() {
	  static std::mt19937 rd{std::random_device{}()};
	  static std::uniform_int_distribution<size_t> mrand(0, 10);
	  const size_t kNumLoops = 1000000;
	  size_t total = 0;
	  for (size_t left = kNumLoops; left != 0; --left) {
	    total += mrand(rd);
	  }
	  benchmark::DoNotOptimize(total);
	  return total;
	}

	// Runs `threadcount` worker threads that each call do_work(). Stores the
	// combined counter values taken before any worker does its work in
	// `*before`, and those taken after all workers have been joined in `*after`.
	// Both pointers are checked to be non-null.
	void measure(size_t threadcount, std::map<std::string, uint64_t>* before,
	             std::map<std::string, uint64_t>* after) {
	  BM_CHECK_NE(before, nullptr);
	  BM_CHECK_NE(after, nullptr);
	  std::vector<std::thread> threads(threadcount);
	  // Because we do not care whether the threads execute concurrently, but we do
	  // care that they do all of their work between the SnapshotAndCombine calls,
	  // we serialize them with a mutex. See
	  // https://github.com/google/benchmark/issues/2173.
	  std::mutex mutex;
	  auto work = [&mutex]() {
	    // Scoped guard instead of manual lock()/unlock(): the mutex is released
	    // on every path out of the lambda.
	    std::lock_guard<std::mutex> guard(mutex);
	    BM_CHECK(do_work() > 1000);
	  };

	  // The counters are set up before the threads are started so that the
	  // threads inherit them.
	  auto counters =
	      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
	  {
	    // Hold the mutex while spawning so that no worker can begin its work
	    // before the "before" snapshot has been taken.
	    std::lock_guard<std::mutex> hold(mutex);
	    for (auto& t : threads) {
	      t = std::thread(work);
	    }
	    *before = SnapshotAndCombine(counters);
	  }
	  // Joining every thread ensures all the work has finished before the
	  // "after" snapshot.
	  for (auto& t : threads) {
	    t.join();
	  }
	  *after = SnapshotAndCombine(counters);
	}

	// Compares the cycles and instructions consumed by two worker threads with
	// those consumed by four, and requires the two to be within 10x of each
	// other.
	TEST(PerfCountersTest, MultiThreaded) {
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());
	  std::map<std::string, uint64_t> before, after;

	  // Growth of one event between the most recent pair of snapshots.
	  auto elapsed = [&before, &after](const char* event) {
	    return static_cast<double>(after[event] - before[event]);
	  };

	  // Notice that this test will work even if we taskset it to a single CPU
	  // In this case the threads will run sequentially
	  // Start two threads and measure the number of combined cycles and
	  // instructions
	  measure(2, &before, &after);
	  const std::vector<double> two_threads{elapsed(kGenericPerfEvent1),
	                                        elapsed(kGenericPerfEvent2)};

	  // Start four threads and measure the number of combined cycles and
	  // instructions
	  measure(4, &before, &after);
	  const std::vector<double> four_threads{elapsed(kGenericPerfEvent1),
	                                         elapsed(kGenericPerfEvent2)};

	  // Expecting four threads to be slower than two fails (at least on a beefy
	  // workstation with lots of cpus) - in some circumstances the runtime of 4
	  // threads can even be better than with 2. So we only make sure the two
	  // runs do not differ too much in general (one is not more than 10x than
	  // the other).
	  const double cycles_ratio = four_threads[0] / two_threads[0];
	  const double instructions_ratio = four_threads[1] / two_threads[1];
	  EXPECT_THAT(cycles_ratio, AllOf(Gt(0.1), Lt(10)));
	  EXPECT_THAT(instructions_ratio, AllOf(Gt(0.1), Lt(10)));
	}

	// Starts and stops a single measurement over a small group of hardware
	// events, restricted to the events reported as supported.
	TEST(PerfCountersTest, HardwareLimits) {
	  // The test works (i.e. causes read to fail) for the assumptions
	  // about hardware capabilities (i.e. small number (3-4) hardware
	  // counters) at this date,
	  // the same as previous test ReopenExistingCounters.
	  if (!PerfCounters::kSupported) {
	    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
	  }
	  EXPECT_TRUE(PerfCounters::Initialize());

	  // Taken from `perf list`, but focusses only on those HW events that actually
	  // were reported when running `sudo perf stat -a sleep 10`, intersected over
	  // several platforms. All HW events listed in the first command not reported
	  // in the second seem to not work. This is sad as we don't really get to test
	  // the grouping here (groups can contain up to 6 members)...
	  const std::vector<std::string> candidates{
	      "cycles",         // leader
	      "instructions",   //
	      "branch-misses",  //
	  };

	  // In the off-chance that some of these events are not supported, filter
	  // them out so the test completes without failure, even though it might
	  // then not actually test the grouping on that platform.
	  std::vector<std::string> supported;
	  for (const auto& candidate : candidates) {
	    if (!PerfCounters::IsCounterSupported(candidate)) {
	      continue;
	    }
	    supported.push_back(candidate);
	  }
	  PerfCountersMeasurement measurement(supported);

	  std::vector<std::pair<std::string, double>> results;

	  measurement.Start();
	  EXPECT_TRUE(measurement.Stop(results));
	}

	} // namespace