| // Copyright 2023 The Pigweed Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| // use this file except in compliance with the License. You may obtain a copy of |
| // the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| // License for the specific language governing permissions and limitations under |
| // the License. |
| |
| //! # pw_tokenizer_core |
| //! |
//! This crate provides the core functionality for calculating a token hash
//! for a string or byte sequence. It is intended to provide a minimal core
//! for both the main `pw_tokenizer` and `pw_tokenizer_macro` crates. Users
//! should prefer depending on `pw_tokenizer` instead of this crate.
| #![cfg_attr(not(feature = "std"), no_std)] |
| |
/// Coefficient of the polynomial "65599" hash used to calculate tokens.
pub const HASH_CONSTANT: u32 = 65599u32;
| |
| /// Hasher is used to calculate a token's hash value. |
| /// |
| /// The hash state is encapsulated in the `Hasher` to allow the hashing of |
/// multi-part strings where the concatenated value cannot be known at macro
| /// expansion time. All functions are `const`. |
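///
/// For example, hashing data in multiple parts produces the same result as
/// hashing the concatenated data in a single call:
///
/// ```
/// use pw_tokenizer_core::{hash_bytes, Hasher};
///
/// let multi_part_hash = Hasher::new(11, 11)
///     .process_bytes(b"hello ")
///     .process_bytes(b"world")
///     .hash();
/// assert_eq!(multi_part_hash, hash_bytes(b"hello world"));
/// ```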
| pub struct Hasher { |
| coefficient: u32, |
| hash: u32, |
| bytes_hashed: usize, |
| hash_len: usize, |
| } |
| |
| impl Hasher { |
    /// Creates a new `Hasher`.
    ///
    /// `data_len` is the total length of the data to be hashed and is used
    /// to seed the hash. `hash_len` is the maximum number of bytes that will
    /// be hashed.
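    ///
    /// For example, because `data_len` seeds the hash, hashing the same
    /// bytes with different total lengths yields different hashes:
    ///
    /// ```
    /// use pw_tokenizer_core::Hasher;
    ///
    /// let a = Hasher::new(8, 4).process_bytes(&[1, 2, 3, 4]).hash();
    /// let b = Hasher::new(4, 4).process_bytes(&[1, 2, 3, 4]).hash();
    /// assert_ne!(a, b);
    /// ```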
    pub const fn new(data_len: usize, hash_len: usize) -> Self {
        Self {
            coefficient: HASH_CONSTANT,
            hash: data_len as u32,
            bytes_hashed: 0,
            hash_len,
        }
    }
| |
| /// Processes `bytes` and updates hash state. |
| /// |
| /// Consumes `self` and returns a [`Hasher`] with the updated state. |
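    ///
    /// Bytes beyond the `hash_len` given to [`Hasher::new`] are ignored:
    ///
    /// ```
    /// use pw_tokenizer_core::Hasher;
    ///
    /// // Both calls hash only the first two bytes (`hash_len` is 2).
    /// let a = Hasher::new(4, 2).process_bytes(b"abcd").hash();
    /// let b = Hasher::new(4, 2).process_bytes(b"abzz").hash();
    /// assert_eq!(a, b);
    /// ```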
| pub const fn process_bytes(mut self, bytes: &[u8]) -> Self { |
| let bytes_left = self.hash_len - self.bytes_hashed; |
| |
| let bytes = if bytes.len() > bytes_left { |
| bytes.split_at(bytes_left).0 |
| } else { |
| bytes |
| }; |
| |
        // `for` loops are not allowed in `const` functions, so use `while`.
| let mut i = 0; |
| while i < bytes.len() { |
| // We call `u32::wrapping_*` instead of using `core::num::Wrapping` to |
| // avoid using traits in a const function. |
| self.hash = self |
| .hash |
| .wrapping_add(self.coefficient.wrapping_mul(bytes[i] as u32)); |
| self.coefficient = self.coefficient.wrapping_mul(HASH_CONSTANT); |
| i += 1; |
| } |
| self.bytes_hashed += i; |
| |
| self |
| } |
| |
    /// Consumes `self` and returns the calculated hash.
| pub const fn hash(self) -> u32 { |
| self.hash |
| } |
| } |
| /// Calculate the hash for a sequence of bytes. |
| /// |
| /// ``` |
| /// use pw_tokenizer_core::hash_bytes; |
| /// |
| /// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]); |
| /// assert_eq!(hash, 0x9e624642); |
| /// ``` |
| pub const fn hash_bytes(bytes: &[u8]) -> u32 { |
| hash_bytes_fixed(bytes, bytes.len()) |
| } |
| |
/// Calculates the hash of a sequence of bytes, hashing at most `len` bytes.
///
/// The full length of `bytes` still seeds the hash, but only the first `len`
/// bytes are processed.
| /// |
| /// ``` |
| /// use pw_tokenizer_core::hash_bytes_fixed; |
| /// |
| /// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4); |
| /// assert_eq!(hash, 0x92c5d2ac); |
| /// ``` |
| pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 { |
| Hasher::new(bytes.len(), len).process_bytes(bytes).hash() |
| } |
| |
/// Calculates the hash of a string.
| /// |
| /// ``` |
| /// use pw_tokenizer_core::hash_string; |
| /// |
| /// let hash = hash_string("I 💖 Pigweed"); |
| /// assert_eq!(hash, 0xe318d1b3); |
| /// ``` |
| pub const fn hash_string(s: &str) -> u32 { |
| hash_bytes(s.as_bytes()) |
| } |
| |
/// Magic value used to identify tokenizer entries in token databases.
pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;
| |
| #[cfg(test)] |
| mod tests { |
| use super::{hash_bytes_fixed, Hasher}; |
| |
| struct TestCase { |
| string: &'static [u8], |
| hash_length: usize, |
| hash: u32, |
| } |
| |
| // Generated file defines `test_cases()`. |
| include!("pw_tokenizer_core_test_cases.rs"); |
| |
| #[test] |
| fn hash_bytes_fixed_generates_correct_hash() { |
| for test in test_cases() { |
| let hash = hash_bytes_fixed(test.string, test.hash_length); |
| assert_eq!( |
| hash, test.hash, |
| "{:08x} != {:08x} string: {:x?} hash_size: {}", |
| hash, test.hash, test.string, test.hash_length |
| ); |
| } |
| } |
| |
| #[test] |
| fn multi_part_data_generates_correct_hash() { |
| for test in test_cases() { |
| let mut hasher = Hasher::new(test.string.len(), test.hash_length); |
            // Break each test string into 8-byte chunks and feed them to the
            // hasher separately.
| for chunk in test.string.chunks(8) { |
| hasher = hasher.process_bytes(chunk); |
| } |
| let hash = hasher.hash(); |
| assert_eq!( |
| hash, test.hash, |
| "{:08x} != {:08x} string: {:x?} hash_size: {}", |
| hash, test.hash, test.string, test.hash_length |
| ); |
| } |
| } |
| } |