pw_tokenizer: Database from JSON
Adds JSON support to the token database creation tool to facilitate
side-band database generation.
Change-Id: I9b7e55d79e45b742882b6df8eb193cba085d4659
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/78100
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Armando Montanez <amontanez@google.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 5a71d5f..e124e12 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -662,6 +662,15 @@
0x70: 25 75 20 25 64 00 54 68 65 20 61 6e 73 77 65 72 %u %d.The answer
0x80: 20 69 73 3a 20 25 73 00 25 6c 6c 75 00 is: %s.%llu.
+
+JSON support
+------------
+While pw_tokenizer doesn't specify a JSON database format, a token database can
+be created from a JSON-formatted array of strings. This is useful for side-band
+token database generation for strings that are not embedded as parsable tokens
+in compiled binaries. See :ref:`module-pw_tokenizer-database-creation` for
+instructions on generating a token database from a JSON file.
+
Managing token databases
------------------------
Token databases are managed with the ``database.py`` script. This script can be
@@ -672,20 +681,23 @@
``pw_tokenizer/py/example_binary_with_tokenized_strings.elf``. You can use that
file to experiment with the ``database.py`` commands.
+.. _module-pw_tokenizer-database-creation:
+
Create a database
^^^^^^^^^^^^^^^^^
The ``create`` command makes a new token database from ELF files (.elf, .o, .so,
-etc.), archives (.a), or existing token databases (CSV or binary).
+etc.), archives (.a), existing token databases (CSV or binary), or a JSON file
+containing an array of strings.
.. code-block:: sh
./database.py create --database DATABASE_NAME ELF_OR_DATABASE_FILE...
-Two database formats are supported: CSV and binary. Provide ``--type binary`` to
-``create`` to generate a binary database instead of the default CSV. CSV
-databases are great for checking into a source control or for human review.
-Binary databases are more compact and simpler to parse. The C++ detokenizer
-library only supports binary databases currently.
+Two database output formats are supported: CSV and binary. Provide
+``--type binary`` to ``create`` to generate a binary database instead of the
+default CSV. CSV databases are great for checking into source control or for
+human review. Binary databases are more compact and simpler to parse. The C++
+detokenizer library only supports binary databases currently.
Update a database
^^^^^^^^^^^^^^^^^
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index 290c831..74cf995 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -98,6 +98,25 @@
e65aefef, ,"Won't fit : %s%d"
'''
+JSON_SOURCE_STRINGS = '''\
+[
+ "pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
+ "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
+ "pigweed/pw_rpc/client_server.cc",
+ "pigweed/pw_rpc/public/pw_rpc/client_server.h",
+ "This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+]
+'''
+
+CSV_STRINGS = '''\
+2cbf627a, ,"pigweed/pw_rpc/client_server.cc"
+666562a1, ,"protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h"
+6c1e6eb3, ,"pigweed/pw_rpc/public/pw_rpc/client_server.h"
+b25a9932, ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+eadf017f, ,"pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h"
+f815dc5c, ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+'''
+
EXPECTED_REPORT = {
str(TOKENIZED_ENTRIES_ELF): {
'': {
@@ -247,6 +266,16 @@
CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
self._csv.read_text())
+ def test_json_strings(self):
+ strings_file = self._dir / "strings.json"
+
+ with open(strings_file, 'w') as file:
+ file.write(JSON_SOURCE_STRINGS)
+
+ run_cli('create', '--force', '--database', self._csv, strings_file)
+ self.assertEqual(CSV_STRINGS.splitlines(),
+ self._csv.read_text().splitlines())
+
class LegacyDatabaseCommandLineTest(DatabaseCommandLineTest):
"""Test an ELF with the legacy plain string storage format."""
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 39eb185..37351fa 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -166,8 +166,32 @@
return metadata
+def _database_from_strings(strings: List[str]) -> tokens.Database:
+ """Generates a C and C++ compatible database from untokenized strings."""
+ # Generate a C compatible database from the fixed length hash.
+ c_db = tokens.Database.from_strings(
+ strings,
+ tokenize=lambda string: tokens.pw_tokenizer_65599_hash(
+ string, tokens.DEFAULT_C_HASH_LENGTH))
+
+ # Generate a C++ compatible database by allowing the hash to follow the
+ # string length.
+ cpp_db = tokens.Database.from_strings(
+ strings, tokenize=tokens.pw_tokenizer_65599_hash)
+
+ # Use a union of the C and C++ compatible databases.
+ return tokens.Database.merged(c_db, cpp_db)
+
+
+def _database_from_json(fd) -> tokens.Database:
+ return _database_from_strings(json.load(fd))
+
+
def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
- """Loads a Database from a database object, ELF, CSV, or binary database."""
+ """Loads a Database from supported database types.
+
+    Supports Database objects, JSON files, ELFs, CSVs, and binary databases.
+ """
if db is None:
return tokens.Database()
@@ -177,7 +201,7 @@
if isinstance(db, elf_reader.Elf):
return _database_from_elf(db, domain)
- # If it's a str, it might be a path. Check if it's an ELF or CSV.
+ # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
if isinstance(db, (str, Path)):
if not os.path.exists(db):
raise FileNotFoundError(
@@ -188,6 +212,11 @@
if elf_reader.compatible_file(fd):
return _database_from_elf(fd, domain)
+ # Generate a database from JSON.
+ if str(db).endswith('.json'):
+ with open(db, 'r') as json_fd:
+ return _database_from_json(json_fd)
+
# Read the path as a packed binary or CSV file.
return tokens.DatabaseFile(db)
@@ -195,8 +224,12 @@
if elf_reader.compatible_file(db):
return _database_from_elf(db, domain)
- # Read the database as CSV or packed binary from a file object's path.
+ # Read the database as JSON, CSV, or packed binary from a file object's
+ # path.
if hasattr(db, 'name') and os.path.exists(db.name):
+ if db.name.endswith('.json'):
+ return _database_from_json(db)
+
return tokens.DatabaseFile(db.name)
# Read CSV directly from the file object.
@@ -207,7 +240,10 @@
*databases,
domain: Union[str,
Pattern[str]] = tokens.DEFAULT_DOMAIN) -> tokens.Database:
- """Loads a Database from database objects, ELFs, CSVs, or binary files."""
+ """Loads a Database from supported database types.
+
+    Supports Database objects, JSON files, ELFs, CSVs, and binary databases.
+ """
domain = re.compile(domain)
return tokens.Database.merged(*(_load_token_database(db, domain)
for db in databases))
@@ -333,8 +369,9 @@
raise FileNotFoundError(f'{path_or_glob} is not a valid path')
for path in paths:
- # Resolve globs to CSV or compatible binary files.
- if elf_reader.compatible_file(path) or path.endswith('.csv'):
+ # Resolve globs to JSON, CSV, or compatible binary files.
+ if elf_reader.compatible_file(path) or path.endswith(
+ ('.csv', '.json')):
yield Path(path)