pw_tokenizer: Keep the newest removal date for duplicate tokens
- If entries are added to a database that already contains them, keep
  the newest date removed (see the sketch after this list).
- Have the Database __init__ call self.add(entries) so that constructing
  a database and adding entries to it share the same semantics.
- Restore the DirectoryDatabase test that was deleted because the
  removal-date semantics for duplicate tokens had not yet been defined.
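
For illustration, a minimal sketch of the new behavior. It mirrors the
tests added below and assumes the pw_tokenizer Python package is
importable:

  from datetime import datetime

  from pw_tokenizer import tokens

  # Construction now routes through add(), so duplicate (token, string)
  # entries collapse to one entry that keeps the newest removal date.
  db = tokens.Database([
      tokens.TokenizedStringEntry(1, 'Spam', '', datetime.min),
      tokens.TokenizedStringEntry(1, 'Spam', '', datetime.max),
  ])
  assert len(db) == 1
  assert db.token_to_entries[1][0].date_removed == datetime.max

  # No removal date means the token is still present, so None takes
  # precedence over any date.
  db.add([tokens.TokenizedStringEntry(1, 'Spam', '')])
  assert db.token_to_entries[1][0].date_removed is None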
Change-Id: Id2d6bb60537a51521c2e9ed26c838a009c73294a
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/104681
Reviewed-by: Carlos Chinchilla <cachinchilla@google.com>
Commit-Queue: Auto-Submit <auto-submit@pigweed.google.com.iam.gserviceaccount.com>
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 8faff33..80c61eaf 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -741,6 +741,18 @@
./database.py add --database DATABASE_NAME ELF_OR_DATABASE_FILE...
+This command adds new tokens from ELF files or other databases to the database.
+If an added token is already present, its date removed, if any, is updated to
+the newest date.
+
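+For example, the following merges tokens from a newly built ELF into an
+existing database (the file names are illustrative):
+
+.. code-block:: sh
+
+   # File names are examples; substitute your own ELF and database files.
+   ./database.py add --database tokens.csv out/app.elf
+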
A CSV token database can be checked into a source repository and updated as code
changes are made. The build system can invoke ``database.py`` to update the
database after each build.
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index a93d448..9272ac6 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -119,14 +119,13 @@
def __init__(self, entries: Iterable[TokenizedStringEntry] = ()):
"""Creates a token database."""
# The database dict stores each unique (token, string) entry.
- self._database: Dict[_EntryKey, TokenizedStringEntry] = {
- entry.key(): entry
- for entry in entries
- }
+ self._database: Dict[_EntryKey, TokenizedStringEntry] = {}
# This is a cache for fast token lookup that is built as needed.
self._cache: Optional[Dict[int, List[TokenizedStringEntry]]] = None
+ self.add(entries)
+
@classmethod
def from_strings(
cls,
@@ -204,7 +203,10 @@
return removed
def add(self, entries: Iterable[TokenizedStringEntry]) -> None:
- """Adds new entries and updates date_removed for existing entries."""
+ """Adds new entries and updates date_removed for existing entries.
+
+ If the added tokens have removal dates, the newest date is used.
+ """
self._cache = None
for new_entry in entries:
@@ -212,10 +214,17 @@
try:
entry = self._database[new_entry.key()]
entry.domain = new_entry.domain
- entry.date_removed = None
+
+            # Keep the newest removal date of the two entries; a missing
+            # date (None) means the entry is still present and wins.
+ if new_entry.date_removed is None:
+ entry.date_removed = None
+ elif (entry.date_removed
+ and entry.date_removed < new_entry.date_removed):
+ entry.date_removed = new_entry.date_removed
except KeyError:
+            # Make a copy to avoid unintentionally updating the database.
self._database[new_entry.key()] = TokenizedStringEntry(
- new_entry.token, new_entry.string, new_entry.domain)
+ **vars(new_entry))
def purge(
self,
diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 03bcf45..292043d 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py
@@ -14,7 +14,7 @@
# the License.
"""Tests for the tokens module."""
-import datetime
+from datetime import datetime
import io
import logging
from pathlib import Path
@@ -209,7 +209,7 @@
self.assertEqual(jello.token, 0x2e668cd6)
self.assertEqual(jello.string, 'Jello, world!')
- self.assertEqual(jello.date_removed, datetime.datetime(2019, 6, 11))
+ self.assertEqual(jello.date_removed, datetime(2019, 6, 11))
matches = db.token_to_entries[0xe13b0f94]
self.assertEqual(len(matches), 1)
@@ -246,7 +246,7 @@
self.assertEqual(db.token_to_entries[0xe65aefef][0].string,
"Won't fit : %s%d")
- db.purge(datetime.datetime(2019, 6, 11))
+ db.purge(datetime(2019, 6, 11))
self.assertLess(len(db.token_to_entries), original_length)
self.assertFalse(db.token_to_entries[0])
@@ -264,30 +264,30 @@
# Test basic merging into an empty database.
db.merge(
tokens.Database([
- tokens.TokenizedStringEntry(
- 1, 'one', date_removed=datetime.datetime.min),
- tokens.TokenizedStringEntry(
- 2, 'two', date_removed=datetime.datetime.min),
+ tokens.TokenizedStringEntry(1,
+ 'one',
+ date_removed=datetime.min),
+ tokens.TokenizedStringEntry(2,
+ 'two',
+ date_removed=datetime.min),
]))
self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})
- self.assertEqual(db.token_to_entries[1][0].date_removed,
- datetime.datetime.min)
- self.assertEqual(db.token_to_entries[2][0].date_removed,
- datetime.datetime.min)
+ self.assertEqual(db.token_to_entries[1][0].date_removed, datetime.min)
+ self.assertEqual(db.token_to_entries[2][0].date_removed, datetime.min)
# Test merging in an entry with a removal date.
db.merge(
tokens.Database([
tokens.TokenizedStringEntry(3, 'three'),
- tokens.TokenizedStringEntry(
- 4, 'four', date_removed=datetime.datetime.min),
+ tokens.TokenizedStringEntry(4,
+ 'four',
+ date_removed=datetime.min),
]))
self.assertEqual({str(e)
for e in db.entries()},
{'one', 'two', 'three', 'four'})
self.assertIsNone(db.token_to_entries[3][0].date_removed)
- self.assertEqual(db.token_to_entries[4][0].date_removed,
- datetime.datetime.min)
+ self.assertEqual(db.token_to_entries[4][0].date_removed, datetime.min)
# Test merging in one entry.
db.merge(tokens.Database([
@@ -296,24 +296,24 @@
self.assertEqual({str(e)
for e in db.entries()},
{'one', 'two', 'three', 'four', 'five'})
- self.assertEqual(db.token_to_entries[4][0].date_removed,
- datetime.datetime.min)
+ self.assertEqual(db.token_to_entries[4][0].date_removed, datetime.min)
self.assertIsNone(db.token_to_entries[5][0].date_removed)
# Merge in repeated entries with different removal dates.
db.merge(
tokens.Database([
- tokens.TokenizedStringEntry(
- 4, 'four', date_removed=datetime.datetime.max),
- tokens.TokenizedStringEntry(
- 5, 'five', date_removed=datetime.datetime.max),
+ tokens.TokenizedStringEntry(4,
+ 'four',
+ date_removed=datetime.max),
+ tokens.TokenizedStringEntry(5,
+ 'five',
+ date_removed=datetime.max),
]))
self.assertEqual(len(db.entries()), 5)
self.assertEqual({str(e)
for e in db.entries()},
{'one', 'two', 'three', 'four', 'five'})
- self.assertEqual(db.token_to_entries[4][0].date_removed,
- datetime.datetime.max)
+ self.assertEqual(db.token_to_entries[4][0].date_removed, datetime.max)
self.assertIsNone(db.token_to_entries[5][0].date_removed)
# Merge in the same repeated entries now without removal dates.
@@ -341,17 +341,17 @@
tokens.Database([
tokens.TokenizedStringEntry(1,
'one',
- date_removed=datetime.datetime.max)
+ date_removed=datetime.max)
]),
tokens.Database([
tokens.TokenizedStringEntry(2,
'two',
- date_removed=datetime.datetime.min)
+ date_removed=datetime.min)
]),
tokens.Database([
tokens.TokenizedStringEntry(1,
'one',
- date_removed=datetime.datetime.min)
+ date_removed=datetime.min)
]))
self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})
@@ -359,17 +359,17 @@
tokens.Database([
tokens.TokenizedStringEntry(4,
'four',
- date_removed=datetime.datetime.max)
+ date_removed=datetime.max)
]),
tokens.Database([
tokens.TokenizedStringEntry(2,
'two',
- date_removed=datetime.datetime.max)
+ date_removed=datetime.max)
]),
tokens.Database([
tokens.TokenizedStringEntry(3,
'three',
- date_removed=datetime.datetime.min)
+ date_removed=datetime.min)
]))
self.assertEqual({str(e)
for e in db.entries()},
@@ -395,7 +395,7 @@
self.assertTrue(
all(entry.date_removed is None for entry in db.entries()))
- date_1 = datetime.datetime(1, 2, 3)
+ date_1 = datetime(1, 2, 3)
db.mark_removed(_entries('apples', 'oranges', 'pears'), date_1)
@@ -405,7 +405,7 @@
db.token_to_entries[default_hash('CHEESE')][0].date_removed,
date_1)
- now = datetime.datetime.now()
+ now = datetime.now()
db.mark_removed(_entries('MILK', 'CHEESE', 'pears'))
# New strings are not added or re-added in mark_removed().
@@ -441,6 +441,27 @@
'only this one is new'
})
+ def test_add_duplicate_entries_keeps_none_as_removal_date(self) -> None:
+ db = tokens.Database()
+ db.add([
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.now()),
+ tokens.TokenizedStringEntry(1, 'Spam', ''),
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.min),
+ ])
+ self.assertEqual(len(db), 1)
+ self.assertIsNone(db.token_to_entries[1][0].date_removed)
+
+ def test_add_duplicate_entries_keeps_newest_removal_date(self) -> None:
+ db = tokens.Database()
+ db.add([
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.now()),
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.max),
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.now()),
+ tokens.TokenizedStringEntry(1, 'Spam', '', datetime.min),
+ ])
+ self.assertEqual(len(db), 1)
+ self.assertEqual(db.token_to_entries[1][0].date_removed, datetime.max)
+
def test_binary_format_write(self) -> None:
db = read_db_from_csv(CSV_DATABASE)
@@ -594,6 +615,25 @@
self.assertEqual(str(all_databases_merged), str(directory_db))
self.assertEqual(3, len(list(self._db_dir.iterdir())))
+    def test_loading_multiple_files_with_removal_dates(self) -> None:
+ self._db_csv.write_text(CSV_DATABASE)
+ first_csv = tokens.DatabaseFile.create(self._db_csv)
+
+ path_to_second_csv = self._db_dir / 'second.csv'
+ path_to_second_csv.write_text(CSV_DATABASE_2)
+ second_csv = tokens.DatabaseFile.create(path_to_second_csv)
+
+ path_to_third_csv = self._db_dir / 'third.csv'
+ path_to_third_csv.write_text(CSV_DATABASE_3)
+ third_csv = tokens.DatabaseFile.create(path_to_third_csv)
+
+ all_databases_merged = tokens.Database.merged(first_csv, second_csv,
+ third_csv)
+ directory_db = database.load_token_database(self._db_dir)
+
+ self.assertEqual(str(all_databases_merged), str(directory_db))
+ self.assertEqual(3, len(list(self._db_dir.iterdir())))
+
if __name__ == '__main__':
unittest.main()