blob: 5e123b9b781fff0a3f04603166e8b2d597dab715 [file] [log] [blame]
// Copyright 2023 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "./centipede/rolling_hash.h"
#include <stdlib.h>
#include <sys/types.h>
#include <bitset>
#include <cstddef>
#include <deque>
#include <memory>
#include <numeric>
#include <random>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "./centipede/feature.h"
namespace centipede {
namespace {
// Reference implementation for RollingHash.
// Maintains the entire window of hash-ed ints in memory.
// Otherwise, equivalent to RollingHash.
class TestOnlyRollingHash {
public:
void Reset(size_t window_size) {
window_size_ = window_size;
deq_.clear();
}
void Update(uint32_t add, uint32_t remove) {
deq_.push_back(add);
if (deq_.size() > window_size_) deq_.pop_front();
}
uint32_t Hash() const {
uint64_t hash = 0;
for (const auto &value : deq_) {
hash = RollingHash::TestOnlyUpdate(hash, value);
}
return hash;
}
protected:
size_t window_size_ = 0;
std::deque<uint32_t> deq_;
};
// Tests RollingHashType, compares the results with TestOnlyRollingHashType.
template <typename RollingHashType, typename TestOnlyRollingHashType>
void TestRollingHash() {
static RollingHashType hasher; // must be static.
TestOnlyRollingHashType test_hasher;
// Tests on this many ints.
constexpr size_t kDataSize = 1 << 23;
// We test collisions for the full 32-bit hash, and also
// for the hash masked-off by kMask, to ensure that it remains
// a good hash if we only take a subset of bits.
// Our main use case is using the number of bits required for kDomainSize.
constexpr uint32_t kMask = feature_domains::Domain::kDomainSize - 1;
// Allow this many collisions for the masked hash.
constexpr size_t kMaxNumMaskCollisions = 3;
constexpr size_t kNumWindowsSizes = 20;
// kDataSize ints: 0, 1, 2, ...
std::vector<uint32_t> data(kDataSize);
std::iota(data.begin(), data.end(), 0);
// Bitset is a bit faster for this test than a hash set.
using BS = std::bitset<(1ULL << 32)>;
// Allocate BS objects on heap, because they are too large for stack.
std::unique_ptr<BS> collisions_full(new BS());
std::unique_ptr<BS> collisions_mask(new BS());
for (size_t window_size = 1; window_size <= kNumWindowsSizes; ++window_size) {
hasher.Reset(window_size);
test_hasher.Reset(window_size);
// Clear all bitset bits (->reset(), not .reset()).
collisions_full->reset();
collisions_mask->reset();
// pipe all ints in `data` through the hasher, maintaining the window
// of `window_size` elements. Count hash collisions.
size_t num_collisions_full = 0, num_collisions_mask = 0;
for (size_t idx = 0; idx < data.size(); ++idx) {
uint32_t remove = idx >= window_size ? data[idx - window_size] : 0;
hasher.Update(data[idx], remove);
uint32_t hash = hasher.Hash();
num_collisions_full += collisions_full->test(hash);
num_collisions_mask += collisions_mask->test(hash & kMask);
collisions_full->set(hash);
collisions_mask->set(hash & kMask);
if (idx < 100000) {
// test_hasher is much more expensive, test only first few iterations.
test_hasher.Update(data[idx], remove);
EXPECT_EQ(hash, test_hasher.Hash());
}
}
EXPECT_EQ(num_collisions_full, 0);
EXPECT_LE(num_collisions_mask, kMaxNumMaskCollisions);
}
}
TEST(RollingHash, RollingHash) {
TestRollingHash<RollingHash, TestOnlyRollingHash>();
}
} // namespace
} // namespace centipede