// Copyright 2023 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//! # pw_tokenizer_core
//!
//! This crate provides the core functionality of calculating a token hash
//! for a string or byte sequence. This is intended to provide a minimal core
//! for both the main `pw_tokenizer` and `pw_tokenizer_macro` crates. Users
//! should prefer depending on `pw_tokenizer` instead of this crate.
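//!
//! For example (mirroring the `hash_string` doc test below):
//!
//! ```
//! assert_eq!(pw_tokenizer_core::hash_string("I 💖 Pigweed"), 0xe318d1b3);
//! ```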
#![cfg_attr(not(feature = "std"), no_std)]
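/// Multiplicative constant for the hash (the "65599" hash used by Pigweed's
/// tokenizer).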
pub const HASH_CONSTANT: u32 = 65599u32;
/// Hasher is used to calculate a token's hash value.
///
/// The hash state is encapsulated in the `Hasher` to allow hashing of
/// multi-part strings where the concatenated value cannot be known at macro
/// expansion time. All functions are `const`.
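///
/// For illustration, a minimal multi-part sketch (feeding the data in chunks
/// matches hashing the whole string at once):
///
/// ```
/// use pw_tokenizer_core::{hash_string, Hasher};
///
/// // "Pigweed" is 7 bytes long; hash all 7 of them, fed in two chunks.
/// let hasher = Hasher::new(7, 7);
/// let hasher = hasher.process_bytes(b"Pig");
/// let hasher = hasher.process_bytes(b"weed");
/// assert_eq!(hasher.hash(), hash_string("Pigweed"));
/// ```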
pub struct Hasher {
    /// Coefficient applied to the next byte; multiplied by `HASH_CONSTANT`
    /// after each byte is processed.
    coefficient: u32,
    /// Hash accumulated so far, seeded with the total data length.
    hash: u32,
    /// Number of bytes processed so far.
    bytes_hashed: usize,
    /// Maximum number of bytes that contribute to the hash.
    hash_len: usize,
}
impl Hasher {
/// Create a new `Hasher`
///
/// `data_len` is the total length of the data to be hashed. `hash_len`
/// is the number of bytes of data to be used in calculating the hash.
/// `data_len` is used to seed the hash while `hash_len` controls how many
/// bytes are hashed.
pub const fn new(data_len: usize, hash_len: usize) -> Self {
        Self {
            coefficient: HASH_CONSTANT,
            hash: data_len as u32,
            bytes_hashed: 0,
            hash_len,
        }
}
/// Processes `bytes` and updates hash state.
///
/// Consumes `self` and returns a [`Hasher`] with the updated state.
pub const fn process_bytes(mut self, bytes: &[u8]) -> Self {
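        // Only the first `hash_len` bytes of the data contribute to the
        // hash; truncate `bytes` to however many of those remain.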
let bytes_left = self.hash_len - self.bytes_hashed;
let bytes = if bytes.len() > bytes_left {
bytes.split_at(bytes_left).0
} else {
bytes
};
// For loops are not allowed in const functions.
let mut i = 0;
while i < bytes.len() {
// We call `u32::wrapping_*` instead of using `core::num::Wrapping` to
// avoid using traits in a const function.
self.hash = self
.hash
.wrapping_add(self.coefficient.wrapping_mul(bytes[i] as u32));
self.coefficient = self.coefficient.wrapping_mul(HASH_CONSTANT);
i += 1;
}
self.bytes_hashed += i;
self
}
/// Consume `self` and return the hash.
pub const fn hash(self) -> u32 {
self.hash
}
}
/// Calculate the hash for a sequence of bytes.
///
/// ```
/// use pw_tokenizer_core::hash_bytes;
///
/// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
/// assert_eq!(hash, 0x9e624642);
/// ```
pub const fn hash_bytes(bytes: &[u8]) -> u32 {
hash_bytes_fixed(bytes, bytes.len())
}
/// Calculate the hash for a sequence of bytes, examining at most `len` bytes.
///
/// Note that the full `bytes.len()` still seeds the hash, so this is not
/// equivalent to calling [`hash_bytes`] on a truncated slice.
///
/// ```
/// use pw_tokenizer_core::hash_bytes_fixed;
///
/// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
/// assert_eq!(hash, 0x92c5d2ac);
/// ```
pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
Hasher::new(bytes.len(), len).process_bytes(bytes).hash()
}
/// Calculate the hash for a string.
///
/// ```
/// use pw_tokenizer_core::hash_string;
///
/// let hash = hash_string("I 💖 Pigweed");
/// assert_eq!(hash, 0xe318d1b3);
/// ```
pub const fn hash_string(s: &str) -> u32 {
hash_bytes(s.as_bytes())
}
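/// Magic value used to identify tokenizer entries (e.g. in the ELF token
/// database section emitted by the tokenizer tooling).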
pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;
#[cfg(test)]
mod tests {
use super::{hash_bytes_fixed, Hasher};
struct TestCase {
string: &'static [u8],
hash_length: usize,
hash: u32,
}
// Generated file defines `test_cases()`.
include!("pw_tokenizer_core_test_cases.rs");
#[test]
fn hash_bytes_fixed_generates_correct_hash() {
for test in test_cases() {
let hash = hash_bytes_fixed(test.string, test.hash_length);
assert_eq!(
hash, test.hash,
"{:08x} != {:08x} string: {:x?} hash_size: {}",
hash, test.hash, test.string, test.hash_length
);
}
}
#[test]
fn multi_part_data_generates_correct_hash() {
for test in test_cases() {
let mut hasher = Hasher::new(test.string.len(), test.hash_length);
            // Break each test string into 8-byte chunks and feed them to the
            // hasher separately.
for chunk in test.string.chunks(8) {
hasher = hasher.process_bytes(chunk);
}
let hash = hasher.hash();
assert_eq!(
hash, test.hash,
"{:08x} != {:08x} string: {:x?} hash_size: {}",
hash, test.hash, test.string, test.hash_length
);
}
}
}