#2080: Fix rate and thread rate counter aggregates (#2081)

* Update counter.cc

* User counters: normalize time by thread count

Fixes https://github.com/google/benchmark/issues/2080

* docs

---------

Co-authored-by: Roman Lebedev <lebedev.ri@gmail.com>
diff --git a/docs/user_guide.md b/docs/user_guide.md
index ae8e125..72fb682 100644
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@@ -749,10 +749,6 @@
 conversion to `double&`. Thus you can use all of the standard arithmetic
 assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter.
 
-In multithreaded benchmarks, each counter is set on the calling thread only.
-When the benchmark finishes, the counters from each thread will be summed;
-the resulting sum is the value which will be shown for the benchmark.
-
 The `Counter` constructor accepts three parameters: the value as a `double`
 ; a bit flag which allows you to show counters as rates, and/or as per-thread
 iteration, and/or as per-thread averages, and/or iteration invariants,
@@ -797,6 +793,10 @@
 ```
 <!-- {% endraw %} -->
 
+In multithreaded benchmarks, each counter is set on the calling thread only.
+When the benchmark finishes, the counters from each thread will be summed.
+Counters that are configured with `kIsRate` will report the average rate across all threads, while `kAvgThreadsRate` counters will report the average rate per thread.
+
 ### Counter Reporting
 
 When using the console reporter, by default, user counters are printed at
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 3cfa8a4..d8a2357 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -123,7 +123,12 @@
               : 0;
     }
 
-    internal::Finish(&report.counters, results.iterations, seconds,
+    // The CPU time is the total time taken by all threads. If we used that as
+    // the denominator, we'd be calculating the rate per thread here. This is
+    // why we have to divide the total cpu_time by the number of threads for
+    // global counters to get a global rate.
+    const double thread_seconds = seconds / b.threads();
+    internal::Finish(&report.counters, results.iterations, thread_seconds,
                      b.threads());
   }
   return report;
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
index d53ffdc..7db0e20 100644
--- a/test/user_counters_tabular_test.cc
+++ b/test/user_counters_tabular_test.cc
@@ -418,7 +418,7 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckTabularRate(Results const& e) {
-  double t = e.DurationCPUTime();
+  double t = e.DurationCPUTime() / e.NumThreads();
   CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
index c55ad98..a8af087 100644
--- a/test/user_counters_test.cc
+++ b/test/user_counters_test.cc
@@ -381,8 +381,10 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgThreadsRate(Results const& e) {
-  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / e.DurationCPUTime(), 0.001);
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
                         &CheckAvgThreadsRate);
diff --git a/test/user_counters_threads_test.cc b/test/user_counters_threads_test.cc
index 027773e..e2e5ade 100644
--- a/test/user_counters_threads_test.cc
+++ b/test/user_counters_threads_test.cc
@@ -107,7 +107,8 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckBytesAndItemsPSec(Results const& e) {
-  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
   CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1 * e.NumThreads());
   // check that the values are within 0.1% of the expected values
   CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ,
@@ -158,7 +159,8 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckRate(Results const& e) {
-  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
   // check that the values are within 0.1% of the expected values
   CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, (1. * e.NumThreads()) / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, (2. * e.NumThreads()) / t, 0.001);
@@ -258,7 +260,8 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckInvertedRate(Results const& e) {
-  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
   // check that the values are within 0.1% of the expected values
   CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, t / (e.NumThreads()), 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, t / (8192.0 * e.NumThreads()), 0.001);
@@ -394,8 +397,10 @@
 // VS2013 does not allow this function to be passed as a lambda argument
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgThreadsRate(Results const& e) {
-  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / e.DurationCPUTime(), 0.001);
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001);
 }
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
                         &CheckAvgThreadsRate);
@@ -496,7 +501,8 @@
 // to CHECK_BENCHMARK_RESULTS()
 void CheckIsIterationInvariantRate(Results const& e) {
   double its = e.NumIterations();
-  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
   // check that the values are within 0.1% of the expected values
   CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * 1. * e.NumThreads() / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, its * 2. * e.NumThreads() / t, 0.001);
@@ -596,7 +602,8 @@
 // to CHECK_BENCHMARK_RESULTS()
 void CheckAvgIterationsRate(Results const& e) {
   double its = e.NumIterations();
-  double t = e.DurationCPUTime();  // this (and not real time) is the time used
+  // this (and not real time) is the time used
+  double t = e.DurationCPUTime() / e.NumThreads();
   // check that the values are within 0.1% of the expected values
   CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. * e.NumThreads() / its / t, 0.001);
   CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. * e.NumThreads() / its / t, 0.001);