pw_tokenizer: Variable length hash
Renames pw_tokenizer_65599_fixed_length_hash to pw_tokenizer_65599_hash
and changes the function to perform variable-length hashing by default.
This makes it easier to generate unbounded-length tokens in addition to
fixed-length tokens.
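A sketch of the new call pattern (the strings here are illustrative):

  pw_tokenizer_65599_hash('%d items')      # New default: hash entire string.
  pw_tokenizer_65599_hash('%d items', 96)  # Old fixed-length behavior.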
Change-Id: I7dac54fbf39efa523708e9c1db49a28859078240
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/78881
Reviewed-by: Wyatt Hepler <hepler@google.com>
Pigweed-Auto-Submit: Armando Montanez <amontanez@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 78f8e74..5a71d5f 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -410,6 +410,23 @@
.. autofunction:: pw_tokenizer.encode.encode_token_and_args
+This function requires that the string's token has already been calculated.
+Typically, tokens are provided by a database, but they can also be created
+manually using the tokenizer hash.
+
+.. autofunction:: pw_tokenizer.tokens.pw_tokenizer_65599_hash
+
+This is particularly useful for offline token database generation in cases where
+tokenized strings in a binary cannot be embedded as parsable pw_tokenizer
+entries.
+
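+For example, a token can be computed directly from a format string. A minimal
+sketch (the format string below is only an example):
+
+.. code-block:: python
+
+   from pw_tokenizer import tokens
+
+   # Hash the entire string; this matches tokens produced by C++ code.
+   token = tokens.pw_tokenizer_65599_hash('Temperature is %d degrees')
+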
+.. note::
+  In C, the number of characters of a string that are hashed has a fixed
+  limit controlled by ``PW_TOKENIZER_CFG_C_HASH_LENGTH``. To match tokens
+  produced by C (as opposed to C++) code, call ``pw_tokenizer_65599_hash()``
+  with the same hash length limit. When creating an offline database, it is
+  a good idea to generate tokens for both lengths and merge the databases.
+
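+A sketch of that approach (the string is illustrative; ``DEFAULT_C_HASH_LENGTH``
+mirrors the default C limit):
+
+.. code-block:: python
+
+   from pw_tokenizer import tokens
+
+   string = 'Temperature is %d degrees'
+
+   # Token as hashed by C++ code: the entire string.
+   cpp_token = tokens.pw_tokenizer_65599_hash(string)
+
+   # Token as hashed by C code: limited to PW_TOKENIZER_CFG_C_HASH_LENGTH
+   # characters.
+   c_token = tokens.pw_tokenizer_65599_hash(
+       string, tokens.DEFAULT_C_HASH_LENGTH)
+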
Encoding
--------
The token is a 32-bit hash calculated during compilation. The string is encoded
diff --git a/pw_tokenizer/py/generate_hash_test_data.py b/pw_tokenizer/py/generate_hash_test_data.py
index 586617f..373c82e 100755
--- a/pw_tokenizer/py/generate_hash_test_data.py
+++ b/pw_tokenizer/py/generate_hash_test_data.py
@@ -101,7 +101,7 @@
return _TEST_CASE.format(str=escaped_str,
string_length=len(data),
hash_length=hash_length,
- hash=tokens.pw_tokenizer_65599_fixed_length_hash(
+ hash=tokens.pw_tokenizer_65599_hash(
data, hash_length),
macro=HASH_MACRO.format(hash_length))
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 663935e..f01c00a 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -45,8 +45,8 @@
return char if isinstance(char, int) else ord(char)
-def pw_tokenizer_65599_fixed_length_hash(string: Union[str, bytes],
- hash_length: int) -> int:
+def pw_tokenizer_65599_hash(string: Union[str, bytes],
+                            hash_length: Optional[int] = None) -> int:
"""Hashes the provided string.
This hash function is only used when adding tokens from legacy-style
@@ -63,7 +63,7 @@
def default_hash(string: Union[str, bytes]) -> int:
- return pw_tokenizer_65599_fixed_length_hash(string, DEFAULT_C_HASH_LENGTH)
+ return pw_tokenizer_65599_hash(string, DEFAULT_C_HASH_LENGTH)
class _EntryKey(NamedTuple):
diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 1f67d5f..c205762 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py
@@ -158,8 +158,8 @@
self.assertEqual(answer.string, 'The answer: "%s"')
def test_collisions(self):
- hash_1 = tokens.pw_tokenizer_65599_fixed_length_hash('o000', 96)
- hash_2 = tokens.pw_tokenizer_65599_fixed_length_hash('0Q1Q', 96)
+ hash_1 = tokens.pw_tokenizer_65599_hash('o000', 96)
+ hash_2 = tokens.pw_tokenizer_65599_hash('0Q1Q', 96)
self.assertEqual(hash_1, hash_2)
db = tokens.Database.from_strings(['o000', '0Q1Q'])