trace_processor: Add cache to ClockTracker

Makes simple clock tracking cases faster.
Doubles the importing speed of Chrome traces. I cannot see ClockTracker
in perf profiles anymore.

Change-Id: I5f39b81da440dcc6502cb479b6e8a6070e2e6f4e
diff --git a/src/trace_processor/clock_tracker.cc b/src/trace_processor/clock_tracker.cc
index 696a110..43a6819 100644
--- a/src/trace_processor/clock_tracker.cc
+++ b/src/trace_processor/clock_tracker.cc
@@ -41,6 +41,10 @@
 void ClockTracker::AddSnapshot(const std::vector<ClockValue>& clocks) {
   const auto snapshot_id = cur_snapshot_id_++;
 
+  // Clear the cache
+  static_assert(std::is_trivial<decltype(cache_)>::value, "must be trivial");
+  memset(&cache_[0], 0, sizeof(cache_));
+
   // Compute the fingerprint of the snapshot by hashing all clock ids. This is
   // used by the clock pathfinding logic.
   base::Hash hasher;
@@ -192,25 +196,33 @@
   return ClockPath();  // invalid path.
 }
 
-base::Optional<int64_t> ClockTracker::Convert(ClockId src_clock_id,
-                                              int64_t src_timestamp,
-                                              ClockId target_clock_id) {
-  // TODO(primiano): optimization: I bet A simple LRU cache of the form
-  // (src_clock_id, target_clock_id, latest_timestamp, translation_ns) might
-  // speed up most conversion allowing to skip FindPath and the iterations.
-
+base::Optional<int64_t> ClockTracker::ConvertSlowpath(ClockId src_clock_id,
+                                                      int64_t src_timestamp,
+                                                      ClockId target_clock_id) {
   PERFETTO_DCHECK(!IsReservedSeqScopedClockId(src_clock_id));
   PERFETTO_DCHECK(!IsReservedSeqScopedClockId(target_clock_id));
 
+  context_->storage->IncrementStats(stats::clock_sync_cache_miss);
+
   ClockPath path = FindPath(src_clock_id, target_clock_id);
   if (!path.valid()) {
     context_->storage->IncrementStats(stats::clock_sync_failure);
     return base::nullopt;
   }
 
+  // We can cache only single-path resolutions between two clocks.
+  // Caching multi-path resolutions is harder because the (src,target) tuple
+  // is not enough as a cache key: at any step the |ns| value can lead to a
+  // different choice of the next snapshot. Multi-path resolutions don't seem
+  // too frequent these days, so we focus only on caching the more frequent
+  // one-step resolutions (typically from any clock to the trace clock).
+  const bool cacheable = path.len == 1;
+  CachedClockPath cache_entry{};
+
   // Iterate trough the path found and translate timestamps onto the new clock
   // domain on each step, until the target domain is reached.
-  int64_t ns = GetClock(src_clock_id)->ToNs(src_timestamp);
+  ClockDomain* src_domain = GetClock(src_clock_id);
+  int64_t ns = src_domain->ToNs(src_timestamp);
   for (uint32_t i = 0; i < path.len; ++i) {
     const ClockGraphEdge edge = path.at(i);
     ClockDomain* cur_clock = GetClock(std::get<0>(edge));
@@ -246,12 +258,33 @@
     // The translated timestamp is the relative delta of the source timestamp
     // from the closest snapshot found (ns - *it), plus the timestamp in
     // the new clock domain for the same snapshot id.
-    ns = (ns - *it) + next_timestamp_ns;
+    const int64_t adj = next_timestamp_ns - *it;
+    ns += adj;
+
+    // On the first iteration, keep track of the bounds for the cache entry.
+    // This will allow future Convert() calls to skip the pathfinder logic
+    // as long as the query stays within the bound.
+    if (cacheable) {
+      PERFETTO_DCHECK(i == 0);
+      const int64_t kInt64Min = std::numeric_limits<int64_t>::min();
+      const int64_t kInt64Max = std::numeric_limits<int64_t>::max();
+      cache_entry.min_ts_ns = it == ts_vec.begin() ? kInt64Min : *it;
+      auto ubound = it + 1;
+      cache_entry.max_ts_ns = ubound == ts_vec.end() ? kInt64Max : *ubound;
+      cache_entry.translation_ns = adj;
+    }
 
     // The last clock in the path must be the target clock.
     PERFETTO_DCHECK(i < path.len - 1 || std::get<1>(edge) == target_clock_id);
   }
 
+  if (cacheable) {
+    cache_entry.src = src_clock_id;
+    cache_entry.src_domain = src_domain;
+    cache_entry.target = target_clock_id;
+    cache_[rnd_() % cache_.size()] = cache_entry;
+  }
+
   return ns;
 }
 
diff --git a/src/trace_processor/clock_tracker.h b/src/trace_processor/clock_tracker.h
index c261249..63fe8e1 100644
--- a/src/trace_processor/clock_tracker.h
+++ b/src/trace_processor/clock_tracker.h
@@ -21,6 +21,7 @@
 
 #include <array>
 #include <map>
+#include <random>
 #include <set>
 #include <vector>
 
@@ -154,9 +155,27 @@
   // This is typically called by the code that reads the ClockSnapshot packet.
   void AddSnapshot(const std::vector<ClockValue>&);
 
+  // Converts a timestamp between two clock domains. Tries to use the cache
+  // first (only for single-path resolutions), then falls back on the full
+  // path-finding logic in ConvertSlowpath().
   base::Optional<int64_t> Convert(ClockId src_clock_id,
                                   int64_t src_timestamp,
-                                  ClockId target_clock_id);
+                                  ClockId target_clock_id) {
+    if (PERFETTO_LIKELY(!cache_lookups_disabled_for_testing_)) {
+      for (const auto& ce : cache_) {
+        if (ce.src != src_clock_id || ce.target != target_clock_id)
+          continue;
+        int64_t ns = ce.src_domain->ToNs(src_timestamp);
+        if (ns >= ce.min_ts_ns && ns < ce.max_ts_ns)
+          return ns + ce.translation_ns;
+      }
+    }
+    return ConvertSlowpath(src_clock_id, src_timestamp, target_clock_id);
+  }
+
+  base::Optional<int64_t> ConvertSlowpath(ClockId src_clock_id,
+                                          int64_t src_timestamp,
+                                          ClockId target_clock_id);
 
   base::Optional<int64_t> ToTraceTime(ClockId clock_id, int64_t timestamp) {
     if (clock_id == trace_time_clock_id_)
@@ -169,6 +188,10 @@
     trace_time_clock_id_ = clock_id;
   }
 
+  void set_cache_lookups_disabled_for_testing(bool v) {
+    cache_lookups_disabled_for_testing_ = v;
+  }
+
  private:
   using SnapshotHash = uint32_t;
 
@@ -245,6 +268,17 @@
     }
   };
 
+  // Holds data for cached entries. At the moment only single-path resolutions
+  // are cached.
+  struct CachedClockPath {
+    ClockId src;
+    ClockId target;
+    ClockDomain* src_domain;
+    int64_t min_ts_ns;
+    int64_t max_ts_ns;
+    int64_t translation_ns;
+  };
+
   ClockTracker(const ClockTracker&) = delete;
   ClockTracker& operator=(const ClockTracker&) = delete;
 
@@ -261,6 +295,9 @@
   std::map<ClockId, ClockDomain> clocks_;
   std::set<ClockGraphEdge> graph_;
   std::set<ClockId> non_monotonic_clocks_;
+  std::array<CachedClockPath, 2> cache_{};
+  bool cache_lookups_disabled_for_testing_ = false;
+  std::minstd_rand rnd_;  // For cache eviction.
   uint32_t cur_snapshot_id_ = 0;
 };
 
diff --git a/src/trace_processor/clock_tracker_unittest.cc b/src/trace_processor/clock_tracker_unittest.cc
index e1628af..2fc8e59 100644
--- a/src/trace_processor/clock_tracker_unittest.cc
+++ b/src/trace_processor/clock_tracker_unittest.cc
@@ -16,6 +16,8 @@
 
 #include "src/trace_processor/clock_tracker.h"
 
+#include <random>
+
 #include "perfetto/ext/base/optional.h"
 #include "src/trace_processor/trace_processor_context.h"
 #include "src/trace_processor/trace_storage.h"
@@ -207,6 +209,55 @@
   EXPECT_EQ(*ct_.ToTraceTime(c66_2, 4 /* abs 30 */), 129000);
 }
 
+// Tests that the cache doesn't affect the results of Convert() in unexpected
+// ways.
+TEST_F(ClockTrackerTest, CacheDoesntAffectResults) {
+  std::minstd_rand rnd;
+  int last_mono = 0;
+  int last_boot = 0;
+  int last_raw = 0;
+  static const int increments[] = {1, 2, 10};
+  for (int i = 0; i < 1000; i++) {
+    last_mono += increments[rnd() % base::ArraySize(increments)];
+    last_boot += increments[rnd() % base::ArraySize(increments)];
+    ct_.AddSnapshot({{MONOTONIC, last_mono}, {BOOTTIME, last_boot}});
+
+    last_raw += increments[rnd() % base::ArraySize(increments)];
+    last_boot += increments[rnd() % base::ArraySize(increments)];
+    ct_.AddSnapshot({{MONOTONIC_RAW, last_raw}, {BOOTTIME, last_boot}});
+  }
+
+  for (int i = 0; i < 1000; i++) {
+    int64_t val = static_cast<int64_t>(rnd()) % 10000;
+    for (int j = 0; j < 5; j++) {
+      ClockTracker::ClockId src;
+      ClockTracker::ClockId tgt;
+      if (j == 0) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC, BOOTTIME);
+      } else if (j == 1) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, BOOTTIME);
+      } else if (j == 2) {
+        std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC);
+      } else if (j == 3) {
+        std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC_RAW);
+      } else if (j == 4) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, MONOTONIC);
+      } else {
+        PERFETTO_FATAL("j out of bounds");
+      }
+      // The cache is still written on this call; only the lookup is skipped.
+      ct_.set_cache_lookups_disabled_for_testing(true);
+      auto not_cached = ct_.Convert(src, val, tgt);
+
+      // This should 100% hit the cache.
+      ct_.set_cache_lookups_disabled_for_testing(false);
+      auto cached = ct_.Convert(src, val, tgt);
+
+      ASSERT_EQ(not_cached, cached);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace trace_processor
 }  // namespace perfetto
diff --git a/src/trace_processor/stats.h b/src/trace_processor/stats.h
index 8d52968..7e5c8bc 100644
--- a/src/trace_processor/stats.h
+++ b/src/trace_processor/stats.h
@@ -108,6 +108,7 @@
   F(vmstat_unknown_keys,                      kSingle,  kError,    kAnalysis), \
   F(vulkan_allocations_invalid_string_id,     kSingle,  kError,    kTrace),    \
   F(clock_sync_failure,                       kSingle,  kError,    kAnalysis), \
+  F(clock_sync_cache_miss,                    kSingle,  kInfo,     kAnalysis), \
   F(process_tracker_errors,                   kSingle,  kError,    kAnalysis), \
   F(json_tokenizer_failure,                   kSingle,  kError,    kTrace),    \
   F(heap_graph_invalid_string_id,             kIndexed, kError,    kTrace),    \