Use crc32 with union and rotate in CombineRawImpl. The hash quality ends up depending on how the bit positions in the input values interact with the polynomial used by crc32. We chose the rotation amount by minimizing the probe benchmark geomean across the 64 possible rotation values (we see ~75% geomean improvement). Benchmarks show improvements in lookup latency and hash quality. Loadtests are neutral or leaning positive. ASM diff x86: https://godbolt.org/z/jvG1aY31e ASM diff ARM: https://godbolt.org/z/dPdP6d6d8 ASM diff x86 hashtable lookup (simplified): https://godbolt.org/z/E9ehME647 PiperOrigin-RevId: 868281226 Change-Id: Ifc3bf79de50f15d3d74170911013ca19e73ee240
diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h index a123ebd..8e27bbb 100644 --- a/absl/container/internal/raw_hash_set.h +++ b/absl/container/internal/raw_hash_set.h
@@ -441,24 +441,16 @@ // It is big enough to ensure non-determinism of iteration order. // We store the seed inside a uint64_t together with size and other metadata. // Using 16 bits allows us to save one `and` instruction in H1 (we use - // sign-extended move instead of mov+and). + // zero-extended move instead of mov+and). When absl::Hash is inlined, it can + // also have lower latency knowing that the high bits of the seed are zero. static constexpr size_t kBitCount = 16; - static constexpr size_t kSignBit = uint64_t{1} << (kBitCount - 1); // Returns the seed for the table. - size_t seed() const { - // We use a sign-extended load to ensure high bits are non-zero. - int16_t seed_signed = absl::bit_cast<int16_t>(seed_); - auto seed_sign_extended = - static_cast<std::make_signed_t<size_t>>(seed_signed); - return absl::bit_cast<size_t>(seed_sign_extended); - } + size_t seed() const { return seed_; } private: friend class HashtableSize; - explicit PerTableSeed(uint16_t seed) : seed_(seed) { - ABSL_SWISSTABLE_ASSERT((seed & kSignBit) != 0 || seed == 0); - } + explicit PerTableSeed(uint16_t seed) : seed_(seed) {} // The most significant bit of the seed is always 1 when there is a non-zero // seed. This way, when sign-extended the seed has non-zero high bits. @@ -496,10 +488,10 @@ // We need to use a constant seed when the table is sampled so that sampled // hashes use the same seed and can e.g. identify stuck bits accurately. - void set_sampled_seed() { set_seed(PerTableSeed::kSignBit); } + void set_sampled_seed() { set_seed((std::numeric_limits<uint16_t>::max)()); } bool is_sampled_seed() const { - return (data_ & kSeedMask) == PerTableSeed::kSignBit; + return (data_ & kSeedMask) == (std::numeric_limits<uint16_t>::max)(); } // Returns true if the table has infoz. 
@@ -516,9 +508,7 @@ static uint16_t NextSeed(); private: - void set_seed(uint16_t seed) { - data_ = (data_ & ~kSeedMask) | (seed | PerTableSeed::kSignBit); - } + void set_seed(uint16_t seed) { data_ = (data_ & ~kSeedMask) | seed; } static constexpr size_t kSizeShift = 64 - kSizeBitCount; static constexpr uint64_t kSizeOneNoMetadata = uint64_t{1} << kSizeShift; static constexpr uint64_t kMetadataMask = kSizeOneNoMetadata - 1;
diff --git a/absl/container/internal/raw_hash_set_test.cc b/absl/container/internal/raw_hash_set_test.cc index 0b5ad8f..7136691 100644 --- a/absl/container/internal/raw_hash_set_test.cc +++ b/absl/container/internal/raw_hash_set_test.cc
@@ -4168,16 +4168,6 @@ EXPECT_EQ(t.capacity(), kTargetCapacity); } -// Test that after calling generate_new_seed(), the high bits of the returned -// seed are non-zero. -TEST(PerTableSeed, HighBitsAreNonZero) { - HashtableSize hs(no_seed_empty_tag_t{}); - for (int i = 0; i < 100; ++i) { - hs.generate_new_seed(); - ASSERT_GT(hs.seed().seed() >> 16, 0); - } -} - } // namespace } // namespace container_internal ABSL_NAMESPACE_END
diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h index ea04f47..4fe474e 100644 --- a/absl/hash/internal/hash.h +++ b/absl/hash/internal/hash.h
@@ -1060,10 +1060,46 @@ return mem0 | mem1; } +#ifdef ABSL_HASH_INTERNAL_HAS_CRC32 + +ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t CombineRawImpl(uint64_t state, + uint64_t value) { + // We use a union to access the high and low 32 bits of the state. + union { + uint64_t u64; + struct { +#ifdef ABSL_IS_LITTLE_ENDIAN + uint32_t low, high; +#else // big endian + uint32_t high, low; +#endif + } u32s; + } s; + s.u64 = state; + // The general idea here is to do two CRC32 operations in parallel using the + // low and high 32 bits of state as CRC states. Note that: (1) when absl::Hash + // is inlined into swisstable lookups, we know that the seed's high bits are + // zero so s.u32s.high is available immediately. (2) We chose to rotate value + // right by 45 for the low CRC because that minimizes the probe benchmark + // geomean across the 64 possible rotations. (3) The union makes it easy for + // the compiler to understand that the high and low CRC states are independent + // from each other so that when CombineRawImpl is repeated (e.g. for + // std::pair<size_t, size_t>), the CRC chains can run in parallel. We + // originally tried using bswaps rather than shifting by 32 bits (to get from + // high to low bits) because bswap is one byte smaller in code size, but the + // compiler couldn't understand that the CRC chains were independent. + s.u32s.high = + static_cast<uint32_t>(ABSL_HASH_INTERNAL_CRC32_U64(s.u32s.high, value)); + s.u32s.low = static_cast<uint32_t>( + ABSL_HASH_INTERNAL_CRC32_U64(s.u32s.low, absl::rotr(value, 45))); + return s.u64; +} +#else // ABSL_HASH_INTERNAL_HAS_CRC32 ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t CombineRawImpl(uint64_t state, uint64_t value) { return Mix(state ^ value, kMul); } +#endif // ABSL_HASH_INTERNAL_HAS_CRC32 // Slow dispatch path for calls to CombineContiguousImpl with a size argument // larger than inlined size. Has the same effect as calling