pw_tokenizer: Remove legacy ELF handling
The format for storing strings in ELF files changed in 2020 (eb020a15).
This change removes code and tests for the unused legacy format and
updates function names and docstrings to better reflect the new format.
In particular, default_hash was renamed to c_hash, since it is used to
calculate tokens as C does, with a fixed maximum hash length.
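For illustration, a minimal sketch of how the renamed helper relates to the
general hash (assumes the pw_tokenizer Python package is importable; the
example string is hypothetical and not part of this change):

    from pw_tokenizer import tokens

    string = "The answer is: %s"

    # Fixed-length hash, matching the token the C macros produce at the
    # default hash length.
    c_token = tokens.c_hash(string)

    # Full-string hash, matching C++ and Python tokenization.
    cpp_token = tokens.pw_tokenizer_65599_hash(string)

    print(f"C: {c_token:#010x}  C++: {cpp_token:#010x}")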
Change-Id: Iadfb58525679e1848b9ea7987b4693616069f002
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/107978
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
Reviewed-by: Anthony DiGirolamo <tonymd@google.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 86a9927..87daa6a 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -1338,29 +1338,6 @@
them as an integer. This would be efficient and simple, but only support a small
number of arguments.
-Legacy tokenized string ELF format
-==================================
-The original version of ``pw_tokenizer`` stored tokenized stored as plain C
-strings in the ELF file instead of structured tokenized string entries. Strings
-in different domains were stored in different linker sections. The Python script
-that parsed the ELF file would re-calculate the tokens.
-
-In the current version of ``pw_tokenizer``, tokenized strings are stored in a
-structured entry containing a token, domain, and length-delimited string. This
-has several advantages over the legacy format:
-
-* The Python script does not have to recalculate the token, so any hash
- algorithm may be used in the firmware.
-* In C++, the tokenization hash no longer has a length limitation.
-* Strings with null terminators in them are properly handled.
-* Only one linker section is required in the linker script, instead of a
- separate section for each domain.
-
-To migrate to the new format, all that is required is update the linker sections
-to match those in ``pw_tokenizer_linker_sections.ld``. Replace all
-``pw_tokenized.<DOMAIN>`` sections with one ``pw_tokenizer.entries`` section.
-The Python tooling continues to support the legacy tokenized string ELF format.
-
Compatibility
=============
* C11
diff --git a/pw_tokenizer/py/BUILD.gn b/pw_tokenizer/py/BUILD.gn
index 4c70a28..ceaa749 100644
--- a/pw_tokenizer/py/BUILD.gn
+++ b/pw_tokenizer/py/BUILD.gn
@@ -60,7 +60,6 @@
inputs = [
"elf_reader_test_binary.elf",
"example_binary_with_tokenized_strings.elf",
- "example_legacy_binary_with_tokenized_strings.elf",
]
proto_library = "..:proto"
pylintrc = "$dir_pigweed/.pylintrc"
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index adc0e36..4d80946 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -36,8 +36,6 @@
#
TOKENIZED_ENTRIES_ELF = Path(
__file__).parent / 'example_binary_with_tokenized_strings.elf'
-LEGACY_PLAIN_STRING_ELF = Path(
- __file__).parent / 'example_legacy_binary_with_tokenized_strings.elf'
CSV_DEFAULT_DOMAIN = '''\
00000000, ,""
@@ -290,36 +288,6 @@
self._csv.read_text().splitlines())
-class LegacyDatabaseCommandLineTest(DatabaseCommandLineTest):
- """Test an ELF with the legacy plain string storage format."""
- def setUp(self) -> None:
- super().setUp()
- self._elf = LEGACY_PLAIN_STRING_ELF
-
- # The legacy approach for storing tokenized strings in an ELF always
- # adds an entry for "", even if the empty string was never tokenized.
- self._csv_test_domain = '00000000, ,""\n' + CSV_TEST_DOMAIN
-
- @mock.patch('sys.stdout', new_callable=_mock_output)
- def test_report(self, mock_stdout):
- run_cli('report', self._elf)
-
- report = EXPECTED_REPORT[str(TOKENIZED_ENTRIES_ELF)].copy()
-
- # Count the implicitly added "" entry in TEST_DOMAIN.
- report['TEST_DOMAIN']['present_entries'] += 1
- report['TEST_DOMAIN']['present_size_bytes'] += 1
- report['TEST_DOMAIN']['total_entries'] += 1
- report['TEST_DOMAIN']['total_size_bytes'] += 1
-
- # Rename "" to the legacy name "default"
- report['default'] = report['']
- del report['']
-
- self.assertEqual({str(LEGACY_PLAIN_STRING_ELF): report},
- json.loads(mock_stdout.buffer.getvalue()))
-
-
class TestDirectoryDatabaseCommandLine(unittest.TestCase):
"""Tests the directory database command line interface."""
def setUp(self) -> None:
diff --git a/pw_tokenizer/py/detokenize_test.py b/pw_tokenizer/py/detokenize_test.py
index 1406296..07ca677 100755
--- a/pw_tokenizer/py/detokenize_test.py
+++ b/pw_tokenizer/py/detokenize_test.py
@@ -508,11 +508,11 @@
RECURSION_STRING = f'The secret message is "{JELLO.decode()}"'
RECURSION = b'$' + base64.b64encode(
- struct.pack('I', tokens.default_hash(RECURSION_STRING)))
+ struct.pack('I', tokens.c_hash(RECURSION_STRING)))
RECURSION_STRING_2 = f"'{RECURSION.decode()}', said the spy."
RECURSION_2 = b'$' + base64.b64encode(
- struct.pack('I', tokens.default_hash(RECURSION_STRING_2)))
+ struct.pack('I', tokens.c_hash(RECURSION_STRING_2)))
TEST_CASES = (
(b'', b''),
@@ -539,7 +539,7 @@
db = database.load_token_database(
io.BytesIO(ELF_WITH_TOKENIZER_SECTIONS))
db.add(
- tokens.TokenizedStringEntry(tokens.default_hash(s), s)
+ tokens.TokenizedStringEntry(tokens.c_hash(s), s)
for s in [self.RECURSION_STRING, self.RECURSION_STRING_2])
self.detok = detokenize.Detokenizer(db)
diff --git a/pw_tokenizer/py/example_legacy_binary_with_tokenized_strings.elf b/pw_tokenizer/py/example_legacy_binary_with_tokenized_strings.elf
deleted file mode 100755
index 0fe2e60..0000000
--- a/pw_tokenizer/py/example_legacy_binary_with_tokenized_strings.elf
+++ /dev/null
Binary files differ
diff --git a/pw_tokenizer/py/generate_hash_test_data.py b/pw_tokenizer/py/generate_hash_test_data.py
index 373c82e..a10907c 100755
--- a/pw_tokenizer/py/generate_hash_test_data.py
+++ b/pw_tokenizer/py/generate_hash_test_data.py
@@ -101,8 +101,7 @@
return _TEST_CASE.format(str=escaped_str,
string_length=len(data),
hash_length=hash_length,
- hash=tokens.pw_tokenizer_65599_hash(
- data, hash_length),
+ hash=tokens.c_hash(data, hash_length),
macro=HASH_MACRO.format(hash_length))
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 6f156be..b7b83ac 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -56,9 +56,6 @@
_TOKENIZED_ENTRY_SECTIONS = re.compile(
r'^\.pw_tokenizer.entries(?:\.[_\d]+)?$')
-_LEGACY_STRING_SECTIONS = re.compile(
- r'^\.pw_tokenized\.(?P<domain>[^.]+)(?:\.\d+)?$')
-
_ERROR_HANDLER = 'surrogateescape' # How to deal with UTF-8 decoding errors
@@ -101,21 +98,6 @@
yield entry
-def _read_tokenized_strings(sections: Dict[str, bytes],
- domain: Pattern[str]) -> Iterator[tokens.Database]:
- # Legacy ELF files used "default" as the default domain instead of "". Remap
- # the default if necessary.
- if domain.pattern == tokens.DEFAULT_DOMAIN:
- domain = re.compile('default')
-
- for section, data in sections.items():
- match = _LEGACY_STRING_SECTIONS.match(section)
- if match and domain.match(match.group('domain')):
- yield tokens.Database.from_strings(
- (s.decode(errors=_ERROR_HANDLER) for s in data.split(b'\0')),
- match.group('domain'))
-
-
def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
"""Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
_LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)
@@ -127,12 +109,6 @@
if section_data is not None:
return tokens.Database(_read_tokenized_entries(section_data, domain))
- # Read legacy null-terminated string entries.
- sections = reader.dump_sections(_LEGACY_STRING_SECTIONS)
- if sections:
- return tokens.Database.merged(
- *_read_tokenized_strings(sections, domain))
-
return tokens.Database([])
@@ -144,11 +120,6 @@
yield from frozenset(
e.domain
for e in _read_tokenized_entries(section_data, re.compile('.*')))
- else: # Check for the legacy domain sections
- for section in reader.sections:
- match = _LEGACY_STRING_SECTIONS.match(section.name)
- if match:
- yield match.group('domain')
def read_tokenizer_metadata(elf) -> Dict[str, int]:
@@ -169,11 +140,8 @@
def _database_from_strings(strings: List[str]) -> tokens.Database:
"""Generates a C and C++ compatible database from untokenized strings."""
- # Generate a C compatible database from the fixed length hash.
- c_db = tokens.Database.from_strings(
- strings,
- tokenize=lambda string: tokens.pw_tokenizer_65599_hash(
- string, tokens.DEFAULT_C_HASH_LENGTH))
+ # Generate a C-compatible database from the fixed length hash.
+ c_db = tokens.Database.from_strings(strings, tokenize=tokens.c_hash)
# Generate a C++ compatible database by allowing the hash to follow the
# string length.
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 3da26cd..a485a06 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -32,9 +32,9 @@
DATE_FORMAT = '%Y-%m-%d'
DEFAULT_DOMAIN = ''
-# The default hash length to use. This value only applies when hashing strings
-# from a legacy-style ELF with plain strings. New tokenized string entries
-# include the token alongside the string.
+# The default hash length to use for C-style hashes. This value only applies
+# when manually hashing strings to recreate token calculations in C. The C++
+# hash function does not have a maximum length.
#
# This MUST match the default value of PW_TOKENIZER_CFG_C_HASH_LENGTH in
# pw_tokenizer/public/pw_tokenizer/config.h.
@@ -50,11 +50,13 @@
def pw_tokenizer_65599_hash(string: Union[str, bytes],
+ *,
hash_length: int = None) -> int:
- """Hashes the provided string.
+ """Hashes the string with the hash function used to generate tokens in C++.
- This hash function is only used when adding tokens from legacy-style
- tokenized strings in an ELF, which do not include the token.
+ This hash function is used to calculate tokens from strings in Python. It
+ is not used when extracting tokens from an ELF, since the token is stored
+ in the ELF as part of tokenization.
"""
hash_value = len(string)
coefficient = TOKENIZER_HASH_CONSTANT
@@ -66,8 +68,10 @@
return hash_value
-def default_hash(string: Union[str, bytes]) -> int:
- return pw_tokenizer_65599_hash(string, DEFAULT_C_HASH_LENGTH)
+def c_hash(string: Union[str, bytes],
+ hash_length: int = DEFAULT_C_HASH_LENGTH) -> int:
+ """Hashes the string with the hash function used in C."""
+ return pw_tokenizer_65599_hash(string, hash_length=hash_length)
class _EntryKey(NamedTuple):
@@ -129,10 +133,11 @@
@classmethod
def from_strings(
- cls,
- strings: Iterable[str],
- domain: str = DEFAULT_DOMAIN,
- tokenize: Callable[[str], int] = default_hash) -> 'Database':
+ cls,
+ strings: Iterable[str],
+ domain: str = DEFAULT_DOMAIN,
+ tokenize: Callable[[str],
+ int] = pw_tokenizer_65599_hash) -> 'Database':
"""Creates a Database from an iterable of strings."""
return cls((TokenizedStringEntry(tokenize(string), string, domain)
for string in strings))
diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 8afec72..f6abed6 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py
@@ -23,9 +23,8 @@
from typing import Iterator
import unittest
-from pw_tokenizer import tokens
-from pw_tokenizer import database
-from pw_tokenizer.tokens import default_hash, _LOG
+from pw_tokenizer import database, tokens
+from pw_tokenizer.tokens import c_hash, _LOG
CSV_DATABASE = '''\
00000000,2019-06-10,""
@@ -161,7 +160,7 @@
def _entries(*strings: str) -> Iterator[tokens.TokenizedStringEntry]:
for string in strings:
- yield tokens.TokenizedStringEntry(default_hash(string), string)
+ yield tokens.TokenizedStringEntry(c_hash(string), string)
class TokenDatabaseTest(unittest.TestCase):
@@ -222,8 +221,8 @@
self.assertEqual(answer.string, 'The answer: "%s"')
def test_collisions(self) -> None:
- hash_1 = tokens.pw_tokenizer_65599_hash('o000', 96)
- hash_2 = tokens.pw_tokenizer_65599_hash('0Q1Q', 96)
+ hash_1 = tokens.c_hash('o000', 96)
+ hash_2 = tokens.c_hash('0Q1Q', 96)
self.assertEqual(hash_1, hash_2)
db = tokens.Database.from_strings(['o000', '0Q1Q'])
@@ -399,29 +398,26 @@
db.mark_removed(_entries('apples', 'oranges', 'pears'), date_1)
- self.assertEqual(
- db.token_to_entries[default_hash('MILK')][0].date_removed, date_1)
- self.assertEqual(
- db.token_to_entries[default_hash('CHEESE')][0].date_removed,
- date_1)
+ self.assertEqual(db.token_to_entries[c_hash('MILK')][0].date_removed,
+ date_1)
+ self.assertEqual(db.token_to_entries[c_hash('CHEESE')][0].date_removed,
+ date_1)
now = datetime.now()
db.mark_removed(_entries('MILK', 'CHEESE', 'pears'))
# New strings are not added or re-added in mark_removed().
self.assertGreaterEqual(
- db.token_to_entries[default_hash('MILK')][0].date_removed, date_1)
+ db.token_to_entries[c_hash('MILK')][0].date_removed, date_1)
self.assertGreaterEqual(
- db.token_to_entries[default_hash('CHEESE')][0].date_removed,
- date_1)
+ db.token_to_entries[c_hash('CHEESE')][0].date_removed, date_1)
# These strings were removed.
self.assertGreaterEqual(
- db.token_to_entries[default_hash('apples')][0].date_removed, now)
+ db.token_to_entries[c_hash('apples')][0].date_removed, now)
self.assertGreaterEqual(
- db.token_to_entries[default_hash('oranges')][0].date_removed, now)
- self.assertIsNone(
- db.token_to_entries[default_hash('pears')][0].date_removed)
+ db.token_to_entries[c_hash('oranges')][0].date_removed, now)
+ self.assertIsNone(db.token_to_entries[c_hash('pears')][0].date_removed)
def test_add(self) -> None:
db = tokens.Database()