blob: c4f287921ae32c42c4c3fa6f10e2cff46e30467f [file]
#include <mutex>
#include <random>
#include <thread>
#include "../src/perf_counters.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
// Fallback used only when the GoogleTest headers in use do not define
// GTEST_SKIP.
#ifndef GTEST_SKIP
// Sink type whose assignment operator accepts (and ignores) an output stream,
// so that `GTEST_SKIP() << "message"` forms a valid `return` expression.
struct MsgHandler {
void operator=(std::ostream&) {}
};
// Streams the message to std::cout and returns from the enclosing function.
// Nothing is reported to the test framework by this fallback.
#define GTEST_SKIP() return MsgHandler() = std::cout
#endif
using benchmark::internal::PerfCounters;
using benchmark::internal::PerfCountersMeasurement;
using benchmark::internal::PerfCounterValues;
using ::testing::AllOf;
using ::testing::Gt;
using ::testing::Lt;
namespace {
// Names of the two generic events that most tests in this file request.
const char kGenericPerfEvent1[] = "CYCLES";
const char kGenericPerfEvent2[] = "INSTRUCTIONS";
// Initialize() is expected to report the same value as
// PerfCounters::kSupported.
TEST(PerfCountersTest, Init) {
  const auto initialized = PerfCounters::Initialize();
  EXPECT_EQ(initialized, PerfCounters::kSupported);
}
// A generic event yields one counter per CPU PMU, all sharing the same name.
// Collapsing the reported names into a set keeps the tests independent of how
// many CPU PMUs the machine has.
//
// Returns the distinct counter names held by `pc`.
static std::set<std::string> UniqueCounterNames(const PerfCounters& pc) {
  const auto& all_names = pc.names();
  return std::set<std::string>(all_names.begin(), all_names.end());
}
// Requesting a single generic event must yield exactly one distinct counter
// name.
TEST(PerfCountersTest, OneCounter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Performance counters not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  const auto names =
      UniqueCounterNames(PerfCounters::Create({kGenericPerfEvent1}));
  EXPECT_EQ(names.size(), 1);
}
// Verifies how Create() deals with missing, empty and bogus event names.
TEST(PerfCountersTest, NegativeTest) {
  // Without support, initialization itself must fail and there is nothing
  // else to check.
  if (!PerfCounters::kSupported) {
    EXPECT_FALSE(PerfCounters::Initialize());
    return;
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // Create() always hands back a valid object: unsupported names are warned
  // about and dropped, so inputs with no usable name give zero counters.
  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);

  // Every mixed input below must boil down to exactly the two generic events.
  const std::set<std::string> both_events(
      {kGenericPerfEvent1, kGenericPerfEvent2});

  {
    // An empty name wedged between two good ones is filtered out; two
    // counters must survive, not zero.
    const auto names = UniqueCounterNames(
        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1}));
    EXPECT_EQ(names.size(), 2);
    EXPECT_EQ(names, both_events);
  }
  {
    // Same position, but with an outright bogus name (fat-finger mistake).
    const auto names = UniqueCounterNames(PerfCounters::Create(
        {kGenericPerfEvent2, "not a counter name", kGenericPerfEvent1}));
    EXPECT_EQ(names.size(), 2);
    EXPECT_EQ(names, both_events);
  }
  {
    // Golden input: both names are valid and both must be accepted.
    const auto names = UniqueCounterNames(
        PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2}));
    EXPECT_EQ(names.size(), 2);
  }
  {
    // A bad name at the very end of the list exercises the edge of the chain.
    const auto names = UniqueCounterNames(PerfCounters::Create(
        {kGenericPerfEvent1, kGenericPerfEvent2, "bad event name"}));
    EXPECT_EQ(names.size(), 2);
    EXPECT_EQ(names, both_events);
  }
}
// Takes one snapshot of `counters` and sums the readings per counter name, so
// that counters sharing a name collapse into a single entry.
//
// Returns an empty map when the snapshot fails.
static std::map<std::string, uint64_t> SnapshotAndCombine(
    PerfCounters& counters) {
  std::map<std::string, uint64_t> totals;
  PerfCounterValues snapshot(counters.num_counters());
  if (!counters.Snapshot(&snapshot)) {
    return totals;
  }
  const size_t count = counters.num_counters();
  for (size_t idx = 0; idx < count; ++idx) {
    totals[counters.names()[idx]] += snapshot[idx];
  }
  return totals;
}
// Reads a single generic counter twice and checks that it is non-zero and
// strictly increasing between the two snapshots.
TEST(PerfCountersTest, Read1Counter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters = PerfCounters::Create({kGenericPerfEvent1});

  auto values1 = SnapshotAndCombine(counters);
  // Fatal on purpose: begin() is dereferenced below, and a failed snapshot
  // yields an empty map for which that dereference would be undefined.
  ASSERT_EQ(values1.size(), 1);
  EXPECT_GT(values1.begin()->second, 0);

  auto values2 = SnapshotAndCombine(counters);
  ASSERT_EQ(values2.size(), 1);
  EXPECT_GT(values2.begin()->second, 0);
  EXPECT_GT(values2.begin()->second, values1.begin()->second);
}
// Pins the calling thread to each CPU it can be scheduled on, one at a time,
// and checks that a single generic counter is non-zero and strictly
// increasing there. The original affinity mask is restored afterwards.
TEST(PerfCountersTest, Read1CounterEachCPU) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
#ifdef __linux__
  EXPECT_TRUE(PerfCounters::Initialize());
  cpu_set_t saved_set;
  if (sched_getaffinity(0, sizeof(saved_set), &saved_set) != 0) {
    // This can happen e.g. if there are more than CPU_SETSIZE CPUs.
    GTEST_SKIP() << "Could not save CPU affinity mask.\n";
  }
  for (size_t cpu = 0; cpu != CPU_SETSIZE; ++cpu) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
      // This CPU id is not usable (absent, offline or disallowed). CPU ids
      // are not necessarily contiguous, so keep probing the remaining ids
      // instead of stopping at the first gap.
      continue;
    }
    auto counters = PerfCounters::Create({kGenericPerfEvent1});
    auto values1 = SnapshotAndCombine(counters);
    auto values2 = SnapshotAndCombine(counters);
    EXPECT_EQ(values1.size(), 1);
    EXPECT_EQ(values2.size(), 1);
    if (values1.size() != 1 || values2.size() != 1) {
      // Already reported above. Do not dereference begin() of a possibly
      // empty map, and do not return early: the affinity mask must still be
      // restored after the loop.
      continue;
    }
    EXPECT_GT(values1.begin()->second, 0);
    EXPECT_GT(values2.begin()->second, 0);
    EXPECT_GT(values2.begin()->second, values1.begin()->second);
  }
  EXPECT_EQ(sched_setaffinity(0, sizeof(saved_set), &saved_set), 0);
#else
  GTEST_SKIP() << "Test skipped on non-Linux.\n";
#endif
}
// Reads both generic counters twice and checks that every value is non-zero
// and strictly larger in the second snapshot.
TEST(PerfCountersTest, Read2Counters) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});

  auto values1 = SnapshotAndCombine(counters);
  EXPECT_EQ(values1.size(), 2);
  for (auto& kv : values1) {
    EXPECT_GT(kv.second, 0);
  }

  auto values2 = SnapshotAndCombine(counters);
  // Check the second snapshot here; the size of the first one was already
  // verified above.
  EXPECT_EQ(values2.size(), 2);
  for (auto& kv : values2) {
    EXPECT_GT(kv.second, 0);
    EXPECT_GT(kv.second, values1[kv.first]);
  }
}
// Opens the same generic event twice and expects both instances to be
// readable.
TEST(PerfCountersTest, ReopenExistingCounters) {
  // Known to work on recent and old Intel hardware, Pixel 3, and Pixel 6.
  // Because of Pixel 6 nothing beyond 2 HW counters may be assumed.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  const std::vector<std::string> metrics{kGenericPerfEvent1};
  std::vector<PerfCounters> counters(2);
  for (size_t idx = 0; idx < counters.size(); ++idx) {
    counters[idx] = PerfCounters::Create(metrics);
  }

  // One value buffer, sized after the first instance, is reused for both
  // reads.
  PerfCounterValues values(counters[0].num_counters());
  EXPECT_TRUE(counters[0].Snapshot(&values));
  EXPECT_TRUE(counters[1].Snapshot(&values));
}
// Creates several measurements of the same event and checks that at least
// kMinValidCounters of them can be started and stopped, both all together and
// one at a time.
TEST(PerfCountersTest, CreateExistingMeasurements) {
  // The test relies on the same hardware assumption as
  // ReopenExistingCounters: only a small number (2) of hardware counters can
  // be taken for granted at this date.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // We try kMaxCounters measurements, but only the first kMinValidCounters
  // are required to work. Perhaps in the future we could use libpfm to query
  // for the hardware limits on this particular platform.
  const size_t kMaxCounters = 10;
  const size_t kMinValidCounters = 2;

  // Let's use a ubiquitous counter that is guaranteed to work
  // on all platforms
  const std::vector<std::string> kMetrics{"cycles"};

  // Cannot create a vector of actual objects because the
  // copy constructor of PerfCounters is deleted - and so is
  // implicitly deleted on PerfCountersMeasurement too
  std::vector<std::unique_ptr<PerfCountersMeasurement>>
      perf_counter_measurements;
  perf_counter_measurements.reserve(kMaxCounters);
  for (size_t j = 0; j < kMaxCounters; ++j) {
    perf_counter_measurements.emplace_back(
        new PerfCountersMeasurement(kMetrics));
  }

  std::vector<std::pair<std::string, double>> measurements;

  // Start the measurements one after another, without stopping any, until
  // one refuses to start; max_counters is the number that did start.
  size_t max_counters = kMaxCounters;
  for (size_t i = 0; i < kMaxCounters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    std::set<std::string> names{counter.names().begin(), counter.names().end()};
    EXPECT_EQ(names.size(), 1);
    if (!counter.Start()) {
      max_counters = i;
      break;
    }
  }
  ASSERT_GE(max_counters, kMinValidCounters);

  // Stop all the measurements that were started together. A failing Stop()
  // is tolerated only beyond the guaranteed minimum.
  for (size_t i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }

  // Start/stop individually
  for (size_t i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    measurements.clear();
    counter.Start();
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }
}
// Burns CPU by summing one million pseudo-random numbers in [0, 10].
// The compiler is keen on optimizing such a loop away, hence the no-optimize
// macro on the function and the DoNotOptimize() call on the result; drawing
// from a randomly seeded generator adds entropy in case those are not enough.
//
// Returns the accumulated sum.
BENCHMARK_DONT_OPTIMIZE size_t do_work() {
  static std::mt19937 rd{std::random_device{}()};
  static std::uniform_int_distribution<size_t> mrand(0, 10);
  const size_t kNumLoops = 1000000;
  size_t total = 0;
  size_t remaining = kNumLoops;
  while (remaining-- > 0) {
    total += mrand(rd);
  }
  benchmark::DoNotOptimize(total);
  return total;
}
// Runs do_work() once on each of `threadcount` threads and records the
// combined counter values around that work.
//
// `before` receives a snapshot taken after the threads were created but
// before any of them was allowed to start working; `after` receives a
// snapshot taken once all threads have been joined. Both must be non-null.
void measure(size_t threadcount, std::map<std::string, uint64_t>* before,
             std::map<std::string, uint64_t>* after) {
  BM_CHECK_NE(before, nullptr);
  BM_CHECK_NE(after, nullptr);
  std::vector<std::thread> threads(threadcount);
  // Because we do not care whether the threads execute concurrently, but we do
  // care that they do all of their work between the SnapshotAndCombine calls,
  // we serialize them with a mutex. See
  // https://github.com/google/benchmark/issues/2173.
  std::mutex mutex;
  auto work = [&mutex]() {
    std::lock_guard<std::mutex> lock(mutex);
    BM_CHECK(do_work() > 1000);
  };
  // We need to first set up the counters, then start the threads, so the
  // threads would inherit the counters. But later, we need to first finish
  // all the threads (so all the work is done), then measure the counters.
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
  {
    // Hold the mutex while the threads are created so none of them can begin
    // working until the `before` snapshot has been taken. The guard releases
    // the mutex on every exit path from this scope.
    std::lock_guard<std::mutex> hold(mutex);
    for (auto& t : threads) {
      t = std::thread(work);
    }
    *before = SnapshotAndCombine(counters);
  }
  for (auto& t : threads) {
    t.join();
  }
  *after = SnapshotAndCombine(counters);
}
// Compares the combined cycles and instructions spent by two worker threads
// against those spent by four.
TEST(PerfCountersTest, MultiThreaded) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  std::map<std::string, uint64_t> before, after;
  // Per-event difference between the latest `after` and `before` snapshots:
  // index 0 is the first generic event, index 1 the second.
  const auto elapsed = [&before, &after]() {
    const uint64_t first =
        after[kGenericPerfEvent1] - before[kGenericPerfEvent1];
    const uint64_t second =
        after[kGenericPerfEvent2] - before[kGenericPerfEvent2];
    return std::vector<double>{static_cast<double>(first),
                               static_cast<double>(second)};
  };

  // This works even when the test is taskset to a single CPU; the threads
  // simply run one after another in that case.

  // Two threads.
  measure(2, &before, &after);
  const std::vector<double> Elapsed2Threads = elapsed();

  // Four threads.
  measure(4, &before, &after);
  const std::vector<double> Elapsed4Threads = elapsed();

  // Expecting four threads to cost more than two does not hold (at least on
  // a beefy workstation with lots of cpus): in some circumstances four
  // threads can even come out better than two. So we only require that the
  // two runs stay within a factor of 10 of each other.
  EXPECT_THAT(Elapsed4Threads[0] / Elapsed2Threads[0], AllOf(Gt(0.1), Lt(10)));
  EXPECT_THAT(Elapsed4Threads[1] / Elapsed2Threads[1], AllOf(Gt(0.1), Lt(10)));
}
// Measures a small set of hardware events through one
// PerfCountersMeasurement and expects the measurement to stop successfully.
TEST(PerfCountersTest, HardwareLimits) {
  // Like the other tests in this file, this assumes only a small number
  // (3-4) of hardware counters is available at this date.
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // Taken from `perf list`, restricted to the HW events that were actually
  // reported by `sudo perf stat -a sleep 10`, intersected over several
  // platforms. Events listed by the first command but not reported by the
  // second seem not to work. Sadly this means the grouping is not really
  // exercised here (groups can contain up to 6 members)...
  const std::vector<std::string> counter_names{
      "cycles",  // leader
      "instructions",
      "branch-misses",
  };

  // On the off-chance that some of these events are unsupported, drop them so
  // the test still completes without failure, even though it may then not
  // actually test the grouping on that platform.
  std::vector<std::string> valid_names;
  for (size_t idx = 0; idx < counter_names.size(); ++idx) {
    if (!PerfCounters::IsCounterSupported(counter_names[idx])) {
      continue;
    }
    valid_names.push_back(counter_names[idx]);
  }

  PerfCountersMeasurement counter(valid_names);
  std::vector<std::pair<std::string, double>> measurements;
  counter.Start();
  EXPECT_TRUE(counter.Stop(measurements));
}
} // namespace