pw_tokenizer: Show collisions in database reports

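Database reports previously included only the number of collisions in
each domain. The 'collisions' field is now a dict that maps each
colliding token to the strings that share it, so reports show which
tokens collide. For example, a domain with one collision might report
(token value illustrative):

  'collisions': {0x2e668cd6: ['First string', 'Second string']}

Database.collisions() now lazily yields (token, entries) pairs instead
of building a tuple of all collisions up front.
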
Change-Id: Ied531b531f4e44a8589c98b5707a5fd0c26abeb9
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/26923
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
Reviewed-by: Ewout van Bekkum <ewout@google.com>
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index 54c890e..de863e3 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -105,14 +105,14 @@
             'present_size_bytes': 289,
             'total_entries': 22,
             'total_size_bytes': 289,
-            'collisions': 0
+            'collisions': {}
         },
         'TEST_DOMAIN': {
             'present_entries': 5,
             'present_size_bytes': 57,
             'total_entries': 5,
             'total_size_bytes': 57,
-            'collisions': 0
+            'collisions': {}
         }
     }
 }
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index cf21481..9e7075e 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -28,8 +28,8 @@
 import re
 import struct
 import sys
-from typing import (Callable, Dict, Iterable, Iterator, List, Pattern, Set,
-                    TextIO, Tuple, Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Pattern,
+                    Set, TextIO, Tuple, Union)
 
 try:
     from pw_tokenizer import elf_reader, tokens
@@ -213,22 +213,26 @@
                                     for db in databases))
 
 
-def database_summary(db: tokens.Database) -> Dict[str, int]:
+def database_summary(db: tokens.Database) -> Dict[str, Any]:
     """Returns a simple report of properties of the database."""
     present = [entry for entry in db.entries() if not entry.date_removed]
-
-    # Add 1 to each string's size to account for the null terminator.
-    return {
-        'present_entries': len(present),
-        'present_size_bytes': sum(len(entry.string) + 1 for entry in present),
-        'total_entries': len(db.entries()),
-        'total_size_bytes':
-        sum(len(entry.string) + 1 for entry in db.entries()),
-        'collisions': len(db.collisions()),
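+    # Map each colliding token to the list of strings that share it.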
+    collisions = {
+        token: [e.string for e in entries]
+        for token, entries in db.collisions()
     }
 
+    # Add 1 to each string's size to account for the null terminator.
+    return dict(
+        present_entries=len(present),
+        present_size_bytes=sum(len(entry.string) + 1 for entry in present),
+        total_entries=len(db.entries()),
+        total_size_bytes=sum(len(entry.string) + 1 for entry in db.entries()),
+        collisions=collisions,
+    )
 
-_DatabaseReport = Dict[str, Dict[str, Dict[str, int]]]
+
+_DatabaseReport = Dict[str, Dict[str, Dict[str, Any]]]
 
 
 def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index a45f4b0..f7206b1 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -22,8 +22,8 @@
 from pathlib import Path
 import re
 import struct
-from typing import BinaryIO, Callable, Dict, Iterable, List, NamedTuple
-from typing import Optional, Pattern, Tuple, Union, ValuesView
+from typing import (BinaryIO, Callable, Dict, Iterable, Iterator, List,
+                    NamedTuple, Optional, Pattern, Tuple, Union, ValuesView)
 
 DATE_FORMAT = '%Y-%m-%d'
 DEFAULT_DOMAIN = ''
@@ -155,11 +155,11 @@
         """Returns iterable over all TokenizedStringEntries in the database."""
         return self._database.values()
 
-    def collisions(self) -> Tuple[Tuple[int, List[TokenizedStringEntry]], ...]:
+    def collisions(self) -> Iterator[Tuple[int, List[TokenizedStringEntry]]]:
         """Returns tuple of (token, entries_list)) for all colliding tokens."""
-        return tuple((token, entries)
-                     for token, entries in self.token_to_entries.items()
-                     if len(entries) > 1)
+        for token, entries in self.token_to_entries.items():
+            if len(entries) > 1:
+                yield token, entries
 
     def mark_removals(
             self,