pw_tokenizer: Database from JSON

Adds JSON support to the token database creation tool to facilitate
side-band database generation.

Change-Id: I9b7e55d79e45b742882b6df8eb193cba085d4659
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/78100
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Armando Montanez <amontanez@google.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 5a71d5f..e124e12 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -662,6 +662,15 @@
   0x70: 25 75 20 25 64 00 54 68 65 20 61 6e 73 77 65 72  %u %d.The answer
   0x80: 20 69 73 3a 20 25 73 00 25 6c 6c 75 00            is: %s.%llu.
 
+
+JSON support
+------------
+While pw_tokenizer doesn't specify a JSON database format, a token database can
+be created from a JSON-formatted array of strings. This is useful for side-band
+token database generation for strings that are not embedded as parsable tokens
+in compiled binaries. See :ref:`module-pw_tokenizer-database-creation` for
+instructions on generating a token database from a JSON file.
+
 Managing token databases
 ------------------------
 Token databases are managed with the ``database.py`` script. This script can be
@@ -672,20 +681,23 @@
 ``pw_tokenizer/py/example_binary_with_tokenized_strings.elf``. You can use that
 file to experiment with the ``database.py`` commands.
 
+.. _module-pw_tokenizer-database-creation:
+
 Create a database
 ^^^^^^^^^^^^^^^^^
 The ``create`` command makes a new token database from ELF files (.elf, .o, .so,
-etc.), archives (.a), or existing token databases (CSV or binary).
+etc.), archives (.a), existing token databases (CSV or binary), or a JSON file
+containing an array of strings.
 
 .. code-block:: sh
 
   ./database.py create --database DATABASE_NAME ELF_OR_DATABASE_FILE...
 
-Two database formats are supported: CSV and binary. Provide ``--type binary`` to
-``create`` to generate a binary database instead of the default CSV. CSV
-databases are great for checking into a source control or for human review.
-Binary databases are more compact and simpler to parse. The C++ detokenizer
-library only supports binary databases currently.
+Two database output formats are supported: CSV and binary. Provide
+``--type binary`` to ``create`` to generate a binary database instead of the
+default CSV. CSV databases are great for checking into source control or for
+human review. Binary databases are more compact and simpler to parse. The C++
+detokenizer library only supports binary databases currently.
 
 Update a database
 ^^^^^^^^^^^^^^^^^
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index 290c831..74cf995 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -98,6 +98,25 @@
 e65aefef,          ,"Won't fit : %s%d"
 '''
 
+JSON_SOURCE_STRINGS = '''\
+[
+  "pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h",
+  "protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h",
+  "pigweed/pw_rpc/client_server.cc",
+  "pigweed/pw_rpc/public/pw_rpc/client_server.h",
+  "This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+]
+'''
+
+CSV_STRINGS = '''\
+2cbf627a,          ,"pigweed/pw_rpc/client_server.cc"
+666562a1,          ,"protocol_buffer/gen/pigweed/pw_protobuf/common_protos.proto_library/nanopb/pw_protobuf_protos/status.pb.h"
+6c1e6eb3,          ,"pigweed/pw_rpc/public/pw_rpc/client_server.h"
+b25a9932,          ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+eadf017f,          ,"pigweed/pw_polyfill/standard_library_public/pw_polyfill/standard_library/assert.h"
+f815dc5c,          ,"This is a very long string that will produce two tokens; one for C++ and one for C. This is because this string exceeds the default C hash length."
+'''
+
 EXPECTED_REPORT = {
     str(TOKENIZED_ENTRIES_ELF): {
         '': {
@@ -247,6 +266,16 @@
             CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
             self._csv.read_text())
 
+    def test_json_strings(self):
+        strings_file = self._dir / "strings.json"
+
+        with open(strings_file, 'w') as file:
+            file.write(JSON_SOURCE_STRINGS)
+
+        run_cli('create', '--force', '--database', self._csv, strings_file)
+        self.assertEqual(CSV_STRINGS.splitlines(),
+                         self._csv.read_text().splitlines())
+
 
 class LegacyDatabaseCommandLineTest(DatabaseCommandLineTest):
     """Test an ELF with the legacy plain string storage format."""
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 39eb185..37351fa 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -166,8 +166,32 @@
     return metadata
 
 
+def _database_from_strings(strings: List[str]) -> tokens.Database:
+    """Generates a C and C++ compatible database from untokenized strings."""
+    # Generate a C compatible database from the fixed length hash.
+    c_db = tokens.Database.from_strings(
+        strings,
+        tokenize=lambda string: tokens.pw_tokenizer_65599_hash(
+            string, tokens.DEFAULT_C_HASH_LENGTH))
+
+    # Generate a C++ compatible database by allowing the hash to follow the
+    # string length.
+    cpp_db = tokens.Database.from_strings(
+        strings, tokenize=tokens.pw_tokenizer_65599_hash)
+
+    # Use a union of the C and C++ compatible databases.
+    return tokens.Database.merged(c_db, cpp_db)
+
+
+def _database_from_json(fd) -> tokens.Database:
+    return _database_from_strings(json.load(fd))
+
+
 def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
-    """Loads a Database from a database object, ELF, CSV, or binary database."""
+    """Loads a Database from supported database types.
+
+    Supports Database objects, JSON files, ELFs, CSVs, and binary databases.
+    """
     if db is None:
         return tokens.Database()
 
@@ -177,7 +201,7 @@
     if isinstance(db, elf_reader.Elf):
         return _database_from_elf(db, domain)
 
-    # If it's a str, it might be a path. Check if it's an ELF or CSV.
+    # If it's a str, it might be a path. Check if it's an ELF, CSV, or JSON.
     if isinstance(db, (str, Path)):
         if not os.path.exists(db):
             raise FileNotFoundError(
@@ -188,6 +212,11 @@
             if elf_reader.compatible_file(fd):
                 return _database_from_elf(fd, domain)
 
+        # Generate a database from JSON.
+        if str(db).endswith('.json'):
+            with open(db, 'r') as json_fd:
+                return _database_from_json(json_fd)
+
         # Read the path as a packed binary or CSV file.
         return tokens.DatabaseFile(db)
 
@@ -195,8 +224,12 @@
     if elf_reader.compatible_file(db):
         return _database_from_elf(db, domain)
 
-    # Read the database as CSV or packed binary from a file object's path.
+    # Read the database as JSON, CSV, or packed binary from a file object's
+    # path.
     if hasattr(db, 'name') and os.path.exists(db.name):
+        if db.name.endswith('.json'):
+            return _database_from_json(db)
+
         return tokens.DatabaseFile(db.name)
 
     # Read CSV directly from the file object.
@@ -207,7 +240,10 @@
     *databases,
     domain: Union[str,
                   Pattern[str]] = tokens.DEFAULT_DOMAIN) -> tokens.Database:
-    """Loads a Database from database objects, ELFs, CSVs, or binary files."""
+    """Loads a Database from supported database types.
+
+    Supports Database objects, JSON files, ELFs, CSVs, and binary databases.
+    """
     domain = re.compile(domain)
     return tokens.Database.merged(*(_load_token_database(db, domain)
                                     for db in databases))
@@ -333,8 +369,9 @@
                 raise FileNotFoundError(f'{path_or_glob} is not a valid path')
 
             for path in paths:
-                # Resolve globs to CSV or compatible binary files.
-                if elf_reader.compatible_file(path) or path.endswith('.csv'):
+                # Resolve globs to JSON, CSV, or compatible binary files.
+                if elf_reader.compatible_file(path) or path.endswith(
+                    ('.csv', '.json')):
                     yield Path(path)