trace_processor: Add cache to ClockTracker
Makes simple clock tracking cases faster.
Doubles the importing speed of Chrome traces. I cannot see ClockTracker
in perf profiles anymore.
Change-Id: I5f39b81da440dcc6502cb479b6e8a6070e2e6f4e
diff --git a/src/trace_processor/clock_tracker.cc b/src/trace_processor/clock_tracker.cc
index 696a110..43a6819 100644
--- a/src/trace_processor/clock_tracker.cc
+++ b/src/trace_processor/clock_tracker.cc
@@ -41,6 +41,10 @@
void ClockTracker::AddSnapshot(const std::vector<ClockValue>& clocks) {
const auto snapshot_id = cur_snapshot_id_++;
+ // Clear the cache
+ static_assert(std::is_trivial<decltype(cache_)>::value, "must be trivial");
+ memset(&cache_[0], 0, sizeof(cache_));
+
// Compute the fingerprint of the snapshot by hashing all clock ids. This is
// used by the clock pathfinding logic.
base::Hash hasher;
@@ -192,25 +196,33 @@
return ClockPath(); // invalid path.
}
-base::Optional<int64_t> ClockTracker::Convert(ClockId src_clock_id,
- int64_t src_timestamp,
- ClockId target_clock_id) {
- // TODO(primiano): optimization: I bet A simple LRU cache of the form
- // (src_clock_id, target_clock_id, latest_timestamp, translation_ns) might
- // speed up most conversion allowing to skip FindPath and the iterations.
-
+base::Optional<int64_t> ClockTracker::ConvertSlowpath(ClockId src_clock_id,
+ int64_t src_timestamp,
+ ClockId target_clock_id) {
PERFETTO_DCHECK(!IsReservedSeqScopedClockId(src_clock_id));
PERFETTO_DCHECK(!IsReservedSeqScopedClockId(target_clock_id));
+ context_->storage->IncrementStats(stats::clock_sync_cache_miss);
+
ClockPath path = FindPath(src_clock_id, target_clock_id);
if (!path.valid()) {
context_->storage->IncrementStats(stats::clock_sync_failure);
return base::nullopt;
}
+ // We can cache only single-path resolutions between two clocks.
+ // Caching multi-path resolutions is harder because the (src,target) tuple
+ // is not enough as a cache key: at any step the |ns| value can lead to a
+ // different choice of the next snapshot. Multi-path resolutions don't seem
+ // too frequent these days, so we focus only on caching the more frequent
+ // one-step resolutions (typically from any clock to the trace clock).
+ const bool cacheable = path.len == 1;
+ CachedClockPath cache_entry{};
+
// Iterate trough the path found and translate timestamps onto the new clock
// domain on each step, until the target domain is reached.
- int64_t ns = GetClock(src_clock_id)->ToNs(src_timestamp);
+ ClockDomain* src_domain = GetClock(src_clock_id);
+ int64_t ns = src_domain->ToNs(src_timestamp);
for (uint32_t i = 0; i < path.len; ++i) {
const ClockGraphEdge edge = path.at(i);
ClockDomain* cur_clock = GetClock(std::get<0>(edge));
@@ -246,12 +258,33 @@
// The translated timestamp is the relative delta of the source timestamp
// from the closest snapshot found (ns - *it), plus the timestamp in
// the new clock domain for the same snapshot id.
- ns = (ns - *it) + next_timestamp_ns;
+ const int64_t adj = next_timestamp_ns - *it;
+ ns += adj;
+
+ // On the first iteration, keep track of the bounds for the cache entry.
+ // This will allow future Convert() calls to skip the pathfinder logic
+ // as long as the query stays within the bounds.
+ if (cacheable) {
+ PERFETTO_DCHECK(i == 0);
+ const int64_t kInt64Min = std::numeric_limits<int64_t>::min();
+ const int64_t kInt64Max = std::numeric_limits<int64_t>::max();
+ cache_entry.min_ts_ns = it == ts_vec.begin() ? kInt64Min : *it;
+ auto ubound = it + 1;
+ cache_entry.max_ts_ns = ubound == ts_vec.end() ? kInt64Max : *ubound;
+ cache_entry.translation_ns = adj;
+ }
// The last clock in the path must be the target clock.
PERFETTO_DCHECK(i < path.len - 1 || std::get<1>(edge) == target_clock_id);
}
+ if (cacheable) {
+ cache_entry.src = src_clock_id;
+ cache_entry.src_domain = src_domain;
+ cache_entry.target = target_clock_id;
+ cache_[rnd_() % cache_.size()] = cache_entry;
+ }
+
return ns;
}
diff --git a/src/trace_processor/clock_tracker.h b/src/trace_processor/clock_tracker.h
index c261249..63fe8e1 100644
--- a/src/trace_processor/clock_tracker.h
+++ b/src/trace_processor/clock_tracker.h
@@ -21,6 +21,7 @@
#include <array>
#include <map>
+#include <random>
#include <set>
#include <vector>
@@ -154,9 +155,27 @@
// This is typically called by the code that reads the ClockSnapshot packet.
void AddSnapshot(const std::vector<ClockValue>&);
+ // Converts a timestamp between two clock domains. Tries to use the cache
+ // first (only for single-path resolutions), then falls back on path finding
+ // as described in the header.
base::Optional<int64_t> Convert(ClockId src_clock_id,
int64_t src_timestamp,
- ClockId target_clock_id);
+ ClockId target_clock_id) {
+ if (PERFETTO_LIKELY(!cache_lookups_disabled_for_testing_)) {
+ for (const auto& ce : cache_) {
+ if (ce.src != src_clock_id || ce.target != target_clock_id)
+ continue;
+ int64_t ns = ce.src_domain->ToNs(src_timestamp);
+ if (ns >= ce.min_ts_ns && ns < ce.max_ts_ns)
+ return ns + ce.translation_ns;
+ }
+ }
+ return ConvertSlowpath(src_clock_id, src_timestamp, target_clock_id);
+ }
+
+ base::Optional<int64_t> ConvertSlowpath(ClockId src_clock_id,
+ int64_t src_timestamp,
+ ClockId target_clock_id);
base::Optional<int64_t> ToTraceTime(ClockId clock_id, int64_t timestamp) {
if (clock_id == trace_time_clock_id_)
@@ -169,6 +188,10 @@
trace_time_clock_id_ = clock_id;
}
+ void set_cache_lookups_disabled_for_testing(bool v) {
+ cache_lookups_disabled_for_testing_ = v;
+ }
+
private:
using SnapshotHash = uint32_t;
@@ -245,6 +268,17 @@
}
};
+ // Holds data for cached entries. At the moment only single-path resolutions
+ // are cached.
+ struct CachedClockPath {
+ ClockId src;
+ ClockId target;
+ ClockDomain* src_domain;
+ int64_t min_ts_ns;
+ int64_t max_ts_ns;
+ int64_t translation_ns;
+ };
+
ClockTracker(const ClockTracker&) = delete;
ClockTracker& operator=(const ClockTracker&) = delete;
@@ -261,6 +295,9 @@
std::map<ClockId, ClockDomain> clocks_;
std::set<ClockGraphEdge> graph_;
std::set<ClockId> non_monotonic_clocks_;
+ std::array<CachedClockPath, 2> cache_{};
+ bool cache_lookups_disabled_for_testing_ = false;
+ std::minstd_rand rnd_; // For cache eviction.
uint32_t cur_snapshot_id_ = 0;
};
diff --git a/src/trace_processor/clock_tracker_unittest.cc b/src/trace_processor/clock_tracker_unittest.cc
index e1628af..2fc8e59 100644
--- a/src/trace_processor/clock_tracker_unittest.cc
+++ b/src/trace_processor/clock_tracker_unittest.cc
@@ -16,6 +16,8 @@
#include "src/trace_processor/clock_tracker.h"
+#include <random>
+
#include "perfetto/ext/base/optional.h"
#include "src/trace_processor/trace_processor_context.h"
#include "src/trace_processor/trace_storage.h"
@@ -207,6 +209,55 @@
EXPECT_EQ(*ct_.ToTraceTime(c66_2, 4 /* abs 30 */), 129000);
}
+// Tests that the cache doesn't affect the results of Convert() in unexpected
+// ways.
+TEST_F(ClockTrackerTest, CacheDoesntAffectResults) {
+ std::minstd_rand rnd;
+ int last_mono = 0;
+ int last_boot = 0;
+ int last_raw = 0;
+ static const int increments[] = {1, 2, 10};
+ for (int i = 0; i < 1000; i++) {
+ last_mono += increments[rnd() % base::ArraySize(increments)];
+ last_boot += increments[rnd() % base::ArraySize(increments)];
+ ct_.AddSnapshot({{MONOTONIC, last_mono}, {BOOTTIME, last_boot}});
+
+ last_raw += increments[rnd() % base::ArraySize(increments)];
+ last_boot += increments[rnd() % base::ArraySize(increments)];
+ ct_.AddSnapshot({{MONOTONIC_RAW, last_raw}, {BOOTTIME, last_boot}});
+ }
+
+ for (int i = 0; i < 1000; i++) {
+ int64_t val = static_cast<int64_t>(rnd()) % 10000;
+ for (int j = 0; j < 5; j++) {
+ ClockTracker::ClockId src;
+ ClockTracker::ClockId tgt;
+ if (j == 0) {
+ std::tie(src, tgt) = std::make_tuple(MONOTONIC, BOOTTIME);
+ } else if (j == 1) {
+ std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, BOOTTIME);
+ } else if (j == 2) {
+ std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC);
+ } else if (j == 3) {
+ std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC_RAW);
+ } else if (j == 4) {
+ std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, MONOTONIC);
+ } else {
+ PERFETTO_FATAL("j out of bounds");
+ }
+ // It will still write the cache, just not lookup.
+ ct_.set_cache_lookups_disabled_for_testing(true);
+ auto not_cached = ct_.Convert(src, val, tgt);
+
+ // This should 100% hit the cache.
+ ct_.set_cache_lookups_disabled_for_testing(false);
+ auto cached = ct_.Convert(src, val, tgt);
+
+ ASSERT_EQ(not_cached, cached);
+ }
+ }
+}
+
} // namespace
} // namespace trace_processor
} // namespace perfetto
diff --git a/src/trace_processor/stats.h b/src/trace_processor/stats.h
index 8d52968..7e5c8bc 100644
--- a/src/trace_processor/stats.h
+++ b/src/trace_processor/stats.h
@@ -108,6 +108,7 @@
F(vmstat_unknown_keys, kSingle, kError, kAnalysis), \
F(vulkan_allocations_invalid_string_id, kSingle, kError, kTrace), \
F(clock_sync_failure, kSingle, kError, kAnalysis), \
+ F(clock_sync_cache_miss, kSingle, kInfo, kAnalysis), \
F(process_tracker_errors, kSingle, kError, kAnalysis), \
F(json_tokenizer_failure, kSingle, kError, kTrace), \
F(heap_graph_invalid_string_id, kIndexed, kError, kTrace), \