pw_tokenizer: Variable length hash
Renames pw_tokenizer_65599_fixed_length_hash to pw_tokenizer_65599_hash
and changes the function to perform variable-length hashing by default.
This makes it easier to generate unbounded-length tokens in addition to
fixed-length tokens.
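A sketch of the new call pattern (the strings here are illustrative):

  pw_tokenizer_65599_hash('%d items')      # New default: hash entire string.
  pw_tokenizer_65599_hash('%d items', 96)  # Old fixed-length behavior.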
Change-Id: I7dac54fbf39efa523708e9c1db49a28859078240
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/78881
Reviewed-by: Wyatt Hepler <hepler@google.com>
Pigweed-Auto-Submit: Armando Montanez <amontanez@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 78f8e74..5a71d5f 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -410,6 +410,23 @@
.. autofunction:: pw_tokenizer.encode.encode_token_and_args
+This function requires that the string's token has already been calculated.
+Typically, tokens are provided by a database, but they can also be created
+manually using the tokenizer hash.
+
+.. autofunction:: pw_tokenizer.tokens.pw_tokenizer_65599_hash
+
+This is particularly useful for offline token database generation in cases where
+tokenized strings in a binary cannot be embedded as parsable pw_tokenizer
+entries.
+
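+For example, a token can be computed directly from a format string. A minimal
+sketch (the format string below is only an example):
+
+.. code-block:: python
+
+   from pw_tokenizer import tokens
+
+   # Hash the entire string; this matches tokens produced by C++ code.
+   token = tokens.pw_tokenizer_65599_hash('Temperature is %d degrees')
+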
+.. note::
+  In C, the number of characters of a string that are hashed has a fixed
+  limit controlled by ``PW_TOKENIZER_CFG_C_HASH_LENGTH``. To match tokens
+  produced by C (as opposed to C++) code, call ``pw_tokenizer_65599_hash()``
+  with the same hash length limit. When creating an offline database, it is
+  a good idea to generate tokens for both lengths and merge the databases.
+
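+A sketch of that approach (the string is illustrative; ``DEFAULT_C_HASH_LENGTH``
+mirrors the default C limit):
+
+.. code-block:: python
+
+   from pw_tokenizer import tokens
+
+   string = 'Temperature is %d degrees'
+
+   # Token as hashed by C++ code: the entire string.
+   cpp_token = tokens.pw_tokenizer_65599_hash(string)
+
+   # Token as hashed by C code: limited to PW_TOKENIZER_CFG_C_HASH_LENGTH
+   # characters.
+   c_token = tokens.pw_tokenizer_65599_hash(
+       string, tokens.DEFAULT_C_HASH_LENGTH)
+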
Encoding
--------
The token is a 32-bit hash calculated during compilation. The string is encoded
diff --git a/pw_tokenizer/py/generate_hash_test_data.py b/pw_tokenizer/py/generate_hash_test_data.py
index 586617f..373c82e 100755
--- a/pw_tokenizer/py/generate_hash_test_data.py
+++ b/pw_tokenizer/py/generate_hash_test_data.py
@@ -101,7 +101,7 @@
return _TEST_CASE.format(str=escaped_str,
string_length=len(data),
hash_length=hash_length,
- hash=tokens.pw_tokenizer_65599_fixed_length_hash(
+ hash=tokens.pw_tokenizer_65599_hash(
data, hash_length),
macro=HASH_MACRO.format(hash_length))
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 663935e..f01c00a 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -45,8 +45,8 @@
return char if isinstance(char, int) else ord(char)
-def pw_tokenizer_65599_fixed_length_hash(string: Union[str, bytes],
- hash_length: int) -> int:
+def pw_tokenizer_65599_hash(string: Union[str, bytes],
+                            hash_length: Optional[int] = None) -> int:
"""Hashes the provided string.
This hash function is only used when adding tokens from legacy-style
@@ -63,7 +63,7 @@
def default_hash(string: Union[str, bytes]) -> int:
- return pw_tokenizer_65599_fixed_length_hash(string, DEFAULT_C_HASH_LENGTH)
+ return pw_tokenizer_65599_hash(string, DEFAULT_C_HASH_LENGTH)
class _EntryKey(NamedTuple):
diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 1f67d5f..c205762 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py
@@ -158,8 +158,8 @@
self.assertEqual(answer.string, 'The answer: "%s"')
def test_collisions(self):
- hash_1 = tokens.pw_tokenizer_65599_fixed_length_hash('o000', 96)
- hash_2 = tokens.pw_tokenizer_65599_fixed_length_hash('0Q1Q', 96)
+ hash_1 = tokens.pw_tokenizer_65599_hash('o000', 96)
+ hash_2 = tokens.pw_tokenizer_65599_hash('0Q1Q', 96)
self.assertEqual(hash_1, hash_2)
db = tokens.Database.from_strings(['o000', '0Q1Q'])