blob: bb74d1040833e5d23b047b9dcfff9c5d75de3177 [file]
#ifndef HIGHWAYHASH_HIGHWAYHASH_NANOBENCHMARK_H_
#define HIGHWAYHASH_HIGHWAYHASH_NANOBENCHMARK_H_
// Benchmarks functions of a single integer argument with realistic branch
// prediction hit rates. Uses a robust estimator to summarize the measurements.
// Measurements are precise to about 0.2 cycles.
//
// Example:
// #include "third_party/highwayhash/highwayhash/nanobenchmark.h"
// nanobenchmark::RaiseThreadPriority();
// nanobenchmark::PinThreadToCPU();
// const std::map<size_t, float> durations =
// nanobenchmark::MeasureWithArguments({3, 4, 7, 8}, [](const size_t size) {
// char from[8] = {static_cast<char>(size)};
// char to[8];
// memcpy(to, from, size);
// return to[0];
// });
// printf("Cycles for input = 3: %4.1f\n", durations[3]);
// printf("Cycles for input = 7: %4.1f\n", durations[7]);
//
// Alternatively, repeating samples and measurements increases the precision:
//
// for (const auto& size_samples : nanobenchmark::RepeatedMeasureWithArguments(
// {3, 3, 4, 4, 7, 7, 8, 8}, [](const size_t size) {
// char from[8] = {static_cast<char>(size)};
// char to[8];
// memcpy(to, from, size);
// return to[0];
// })) {
// nanobenchmark::PrintMedianAndVariability(size_samples);
// }
//
// Output:
// 3: median= 20.8 cycles; median abs. deviation= 0.1 cycles
// 4: median= 8.8 cycles; median abs. deviation= 0.2 cycles
// 7: median= 8.8 cycles; median abs. deviation= 0.2 cycles
// 8: median= 27.5 cycles; median abs. deviation= 0.1 cycles
// (7 is presumably faster because it can use two unaligned 32-bit load/stores.)
//
// Background: Microbenchmarks such as http://github.com/google/benchmark
// can measure elapsed times on the order of a microsecond. Shorter functions
// are typically measured by repeating them thousands of times and dividing
// the total elapsed time by this count. Unfortunately, repetition (especially
// with the same input parameter!) influences the runtime. In time-critical
// code, it is reasonable to expect warm instruction/data caches and TLBs,
// but a perfect record of which branches will be taken is unrealistic.
// Unless the application also repeatedly invokes the measured function with
// the same parameter, the benchmark is measuring something very different -
// a best-case result, almost as if the parameter were made a compile-time
// constant. This may lead to erroneous conclusions about branch-heavy
// algorithms outperforming branch-free alternatives.
//
// Our approach differs in three ways. Adding fences to the timer functions
// reduces variability due to instruction reordering, improving the timer
// resolution to about 10 nanoseconds. However, shorter functions must still
// be invoked repeatedly. For more realistic branch prediction performance,
// we vary the input parameter according to a user-specified distribution.
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <map>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>
#include "third_party/highwayhash/highwayhash/arch_specific.h"
#include "third_party/highwayhash/highwayhash/code_annotation.h"
#include "third_party/highwayhash/highwayhash/os_specific.h"
#include "third_party/highwayhash/highwayhash/tsc_timer.h"
// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_ENABLE_CHECKS 0
#endif
#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
while (!(condition)) { \
printf("Nanobenchmark check failed at line %d\n", __LINE__); \
abort(); \
}
#if NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#else
#define NANOBENCHMARK_CHECK(condition)
#endif
namespace nanobenchmark {
#if HH_MSC_VERSION
// MSVC does not support inline assembly anymore (and never supported GCC's
// RTL constraints used below), so we instead pass the address to another
// translation unit and assume that link-time code generation is disabled.
void UseCharPointer(volatile const char*);
template <class T>
inline void PreventElision(T&& output) {
UseCharPointer(reinterpret_cast<volatile const char*>(&output));
}
#else
// Convenience function similar to std::enable_if_t (C++14).
// "Condition" provides a bool value, like std::integral_constant.
// Evaluates to a void return type if Condition is true, otherwise removes from
// consideration the function it annotates.
template <typename Condition>
using EnableIf = typename std::enable_if<Condition::value>::type;
// Simplifies the predicates below; we only special-case floats on x86.
#if ARCH_X64
#define NANOBENCHMARK_IS_FLOAT(T) std::is_floating_point<T>::value
#else
#define NANOBENCHMARK_IS_FLOAT(T) false
#endif
// True for T that can efficiently satisfy +r constraints.
template <typename T>
struct IsRegister {
// Exclude member pointers, which may be larger. On x86, also exclude
// floating-point numbers because they reside in separate registers.
static constexpr bool value = std::is_scalar<T>::value &&
!NANOBENCHMARK_IS_FLOAT(T) &&
!std::is_member_pointer<T>::value;
};
// True for all other T that do not need special handling.
template <typename T>
struct IsMemory {
static constexpr bool value =
!NANOBENCHMARK_IS_FLOAT(T) && !IsRegister<T>::value;
};
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// To avoid unnecessary writes to memory, this should use a +r constraint. That
// fails to compile with T = string/iterator etc, which we also need to protect
// from elision. Always using +m generates unnecessary stores to memory for
// integer/float arguments. +r,+m?? or even +m! should penalize m and only
// choose it if necessary, but that has the same effect as +m on clang.
// We therefore choose between functions specifying +r and +m based on T.
// SFINAE must be applied to the return type instead of the argument so that T
// can be deduced.
template <class T>
inline EnableIf<IsRegister<T>> PreventElision(T&& output) {
asm volatile("" : "+r"(output) : : "memory");
}
// x86: Avoids copying floating-point numbers to memory/eax.
#if defined(__x86_64__) || defined(_M_X64)
template <class T>
inline EnableIf<std::is_floating_point<T>> PreventElision(T&& output) {
// +x = SSE register (used on all x64 for float/double arithmetic).
asm volatile("" : "+x"(output) : : "memory");
}
#endif
template <class T>
inline EnableIf<IsMemory<T>> PreventElision(T&& output) {
// Clang generates redundant stores when using +X (anything) or +g
// (register, memory or immediate).
asm volatile("" : "+m"(output) : : "memory");
}
#endif
// Input parameter for the function being measured.
using Input = size_t;
// Cycles elapsed = difference between two cycle counts. Must be unsigned to
// ensure wraparound on overflow.
using Duration = uint32_t;
// Returns cycles elapsed when passing each of "inputs" (after in-place
// shuffling) to "func", which must return something it has computed
// so the compiler does not optimize it away.
template <typename Func>
Duration CyclesElapsed(const Duration resolution, const Func& func,
std::vector<Input>* inputs) {
// This benchmark attempts to measure the performance of "func" when
// called with realistic inputs, which we assume are randomly drawn
// from the given "inputs" distribution, so we shuffle those values.
std::random_shuffle(inputs->begin(), inputs->end());
const Duration t0 = tsc_timer::Start<Duration>();
for (const Input input : *inputs) {
PreventElision(func(input));
}
const Duration t1 = tsc_timer::Stop<Duration>();
const Duration elapsed = t1 - t0;
NANOBENCHMARK_CHECK(elapsed > resolution);
return elapsed - resolution;
}
// Stores input values for a series of calls to the function to measure.
// We assume inputs are drawn from a known discrete probability distribution,
// modeled as a vector<Input> v. The probability of a value X in v is
// count(v.begin(), v.end(), X) / v.size().
//
// Parameterizing the entire class on Func avoids std::function overhead or
// requiring users to call InitReplicas. Code size is not a major concern.
template <typename Func>
class Inputs {
Inputs(const Inputs&) = delete;
Inputs& operator=(const Inputs&) = delete;
public:
Inputs(const Duration resolution, const std::vector<Input>& distribution,
const Func& func)
: unique_(InitUnique(distribution)),
replicas_(InitReplicas(distribution, resolution, func)),
num_replicas_(replicas_.size() / distribution.size()) {
printf("NumReplicas %zu\n", num_replicas_);
}
// Returns vector of the unique values from the input distribution.
const std::vector<Input>& Unique() const { return unique_; }
// Returns how many instances of "distribution" are in "replicas_", i.e.
// the number of occurrences of an input value that occurred only once
// in the distribution. This is the divisor for computing the duration
// of a single call.
size_t NumReplicas() const { return num_replicas_; }
// Returns the (replicated) input distribution. Modified by caller
// (shuffled in-place) => not thread-safe.
std::vector<Input>& Replicas() { return replicas_; }
// Returns a copy of Replicas() with NumReplicas() occurrences of "input"
// removed. Used for the leave-one-out measurement.
std::vector<Input> Without(const Input input_to_remove) const {
// "input_to_remove" should be in the original distribution.
NANOBENCHMARK_CHECK(std::find(unique_.begin(), unique_.end(),
input_to_remove) != unique_.end());
std::vector<Input> copy = replicas_;
auto pos = std::partition(copy.begin(), copy.end(),
[input_to_remove](const Input input) {
return input_to_remove != input;
});
// Must occur at least num_replicas_ times.
NANOBENCHMARK_CHECK(copy.end() - pos >= num_replicas_);
// (Avoids unused-variable warning.)
PreventElision(pos);
copy.resize(copy.size() - num_replicas_);
return copy;
}
private:
// Returns a copy with any duplicate values removed. Initializing unique_
// through this function allows it to be const.
static std::vector<Input> InitUnique(const std::vector<Input>& distribution) {
std::vector<Input> unique = distribution;
std::sort(unique.begin(), unique.end());
unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
// Our leave-one-out measurement technique only makes sense when
// there are multiple input values.
NANOBENCHMARK_CHECK(unique.size() >= 2);
return unique;
}
// Returns how many replicas of "distribution" are required before
// CyclesElapsed is large enough compared to the timer resolution.
static std::vector<Input> InitReplicas(const std::vector<Input>& distribution,
const Duration resolution,
const Func& func) {
// We compute the difference in duration for inputs = Replicas() vs.
// Without(). Dividing this by num_replicas must yield a value where the
// quantization error (from the timer resolution) is sufficiently small.
const uint64_t min_elapsed = distribution.size() * resolution * 400;
std::vector<Input> replicas;
for (;;) {
AppendReplica(distribution, &replicas);
#if NANOBENCHMARK_ENABLE_CHECKS
const uint64_t t0 = tsc_timer::Start64();
#endif
const Duration elapsed = CyclesElapsed(resolution, func, &replicas);
#if NANOBENCHMARK_ENABLE_CHECKS
const uint64_t t1 = tsc_timer::Stop64();
#endif
// Ensure the 32-bit timer didn't and won't overflow.
NANOBENCHMARK_CHECK((t1 - t0) < (1ULL << 30));
if (elapsed >= min_elapsed) {
return replicas;
}
}
}
// Appends all values in "distribution" to "replicas".
static void AppendReplica(const std::vector<Input>& distribution,
std::vector<Input>* replicas) {
replicas->reserve(replicas->size() + distribution.size());
for (const Input input : distribution) {
replicas->push_back(input);
}
}
const std::vector<Input> unique_;
// Modified by caller (shuffled in-place) => non-const.
std::vector<Input> replicas_;
// Initialized from replicas_.
const size_t num_replicas_;
};
// Holds samples of measured durations, and (robustly) reduces them to a
// single result for each unique input value.
class DurationSamples {
public:
DurationSamples(const std::vector<Input>& unique_inputs,
const size_t num_samples)
: num_samples_(num_samples) {
// Preallocate storage.
for (const Input input : unique_inputs) {
samples_for_input_[input].reserve(num_samples);
}
}
void Add(const Input input, const Duration sample) {
// "input" should be one of the values passed to the ctor.
NANOBENCHMARK_CHECK(samples_for_input_.find(input) !=
samples_for_input_.end());
samples_for_input_[input].push_back(sample);
}
// Invokes "lambda" for each (input, duration) pair. The per-call duration
// is the central tendency (the mode) of the samples.
template <class Lambda>
void Reduce(const Lambda& lambda) {
for (auto& input_and_samples : samples_for_input_) {
const Input input = input_and_samples.first;
std::vector<Duration>& samples = input_and_samples.second;
NANOBENCHMARK_CHECK(samples.size() <= num_samples_);
std::sort(samples.begin(), samples.end());
const Duration duration = tsc_timer::Mode(samples.data(), samples.size());
lambda(input, duration);
}
}
private:
const size_t num_samples_;
std::map<Input, std::vector<Duration>> samples_for_input_;
};
// Gathers "num_samples" durations via repeated leave-one-out measurements.
template <typename Func>
DurationSamples GatherDurationSamples(const Duration resolution,
Inputs<Func>& inputs, const Func& func,
const size_t num_samples) {
DurationSamples samples(inputs.Unique(), num_samples);
for (size_t i = 0; i < num_samples; ++i) {
// Total duration for all shuffled input values. This may change over time,
// so recompute it for each sample.
const Duration total = CyclesElapsed(resolution, func, &inputs.Replicas());
for (const Input input : inputs.Unique()) {
// To isolate the durations of the calls with this input value,
// we measure the duration without those values and subtract that
// from the total, and later divide by NumReplicas.
std::vector<Input> without = inputs.Without(input);
for (int rep = 0; rep < 3; ++rep) {
const Duration elapsed = CyclesElapsed(resolution, func, &without);
if (elapsed < total) {
samples.Add(input, total - elapsed);
break;
}
}
}
}
return samples;
}
// Public API follows:
// Returns measurements of the cycles elapsed when calling "func" with each
// unique input value from the given "distribution", taking special care to
// maintain realistic branch prediction hit rates.
//
// "distribution" should contain the input values required to trigger both
// sides of all conditional branches in "func". A value's probability of
// being used in the real application should be proportional to its frequency
// in the vector. Order does not matter; for example, a uniform distribution
// over [0, 4) could be represented as {3,0,2,1}. The benchmark duration is
// proportional to |distribution| * |unique values|. Repeating each value at
// least once ensures the leave-one-out distribution is closer to the original
// distribution, leading to more realistic results.
//
// "func" acts like std::function<T(Input)>, where T is a 'proof of work'
// return value used to ensure the computations are not elided.
template <typename Func>
std::map<Input, float> MeasureWithArguments(
const std::vector<Input>& distribution, const Func& func) {
const Duration resolution = tsc_timer::Resolution<Duration>();
// Adds enough 'replicas' of the distribution to measure "func" given
// the timer resolution.
Inputs<Func> inputs(resolution, distribution, func);
const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas());
auto samples = GatherDurationSamples(resolution, inputs, func, 1024);
// Return map of duration [cycles] for every unique input value.
std::map<Input, float> durations;
samples.Reduce(
[&durations, per_call](const Input input, const Duration duration) {
durations[input] = static_cast<float>(duration * per_call);
});
NANOBENCHMARK_CHECK(durations.size() == inputs.Unique().size());
return durations;
}
// Optional functions for robust pooling of multiple measurements:
// Returns vectors of duration samples for each input, for subsequent analysis
// by Median/MedianAbsoluteDeviation. The parameters are documented in
// MeasureWithArguments.
template <typename Func>
std::map<Input, std::vector<float>> RepeatedMeasureWithArguments(
const std::vector<Input>& distribution, const Func& func,
const int repetitions = 25) {
const Duration resolution = tsc_timer::Resolution<Duration>();
// Adds enough 'replicas' of the distribution to measure "func" given
// the timer resolution.
Inputs<Func> inputs(resolution, distribution, func);
const double per_call = 1.0 / static_cast<int>(inputs.NumReplicas());
// Preallocate sample storage.
std::map<Input, std::vector<float>> samples_for_input;
for (const Input input : distribution) {
samples_for_input[input].reserve(repetitions);
}
for (int i = 0; i < repetitions; ++i) {
auto samples = GatherDurationSamples(resolution, inputs, func, 512);
// Scatter each input's duration into the sample arrays.
samples.Reduce([&samples_for_input, per_call](const Input input,
const Duration duration) {
const float sample = static_cast<float>(duration * per_call);
samples_for_input[input].push_back(sample);
});
}
return samples_for_input;
}
// Returns the median value. Side effect: sorts "samples".
template <typename T>
T Median(std::vector<T>* samples) {
NANOBENCHMARK_CHECK(!samples->empty());
std::sort(samples->begin(), samples->end());
const size_t half = samples->size() / 2;
// Odd count: return middle
if (samples->size() % 2) {
return (*samples)[half];
}
// Even count: return average of middle two.
return ((*samples)[half] + (*samples)[half - 1]) / 2;
}
// Returns a robust measure of variability.
template <typename T>
T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) {
NANOBENCHMARK_CHECK(!samples.empty());
std::vector<T> abs_deviations;
abs_deviations.reserve(samples.size());
for (const T sample : samples) {
abs_deviations.push_back(std::abs(sample - median));
}
return Median(&abs_deviations);
}
// Print median duration and variability for this size's samples.
inline void PrintMedianAndVariability(
const std::pair<Input, std::vector<float>>& input_samples) {
const Input input = input_samples.first;
auto samples = input_samples.second; // Copy (modified by Median)
const float median = Median(&samples);
const float variability = MedianAbsoluteDeviation(samples, median);
printf("%5zu: median=%5.1f cycles; median abs. deviation=%4.1f cycles\n",
input, median, variability);
}
} // namespace nanobenchmark
#endif // HIGHWAYHASH_HIGHWAYHASH_NANOBENCHMARK_H_