pw_tokenizer: Update committed CSVs in directory databases
- Add untracked and tracked CSV support
- Check whether CSVs are merged or unmerged in a commit
Change-Id: Ib530f3dda5f53d438cf262195a2ba64031a8b40a
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/103500
Commit-Queue: Carlos Chinchilla <cachinchilla@google.com>
Reviewed-by: Wyatt Hepler <hepler@google.com>
Reviewed-by: Leonardo Acosta <leoacosta@google.com>
diff --git a/pw_tokenizer/database.gni b/pw_tokenizer/database.gni
index 07d441f..f5cc21b 100644
--- a/pw_tokenizer/database.gni
+++ b/pw_tokenizer/database.gni
@@ -102,6 +102,12 @@
if (_create == "") {
args = [ "add" ]
+ if (defined(invoker.commit)) {
+ args += [
+ "--discard-temporary",
+ invoker.commit,
+ ]
+ }
inputs += [ _database ]
stamp = true
} else {
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index e81e8ed..ff326c6 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -780,14 +780,31 @@
A directory database is a collection of uniquely named CSV files. The
``add`` command creates a new CSV in the database when the supplied ELF or
database contains new token entries; the resulting CSV contains only the
new tokens.
-The filename is unique in the database folder and is named after a universally
-unique identifier (UUID). This is helpful when there are asynchronous additions
-to the database as merge conflicts are avoided.
+Each file in the database folder is named after a universally unique
+identifier (UUID), which avoids merge conflicts when additions to the
+database happen asynchronously.
.. code-block:: sh
./database.py add --database DIR_DATABASE_NAME ELF_OR_DATABASE_FILE
+The ``--discard-temporary`` argument accepts a git commit hash or reference
+and maintains a single CSV of the new tokens added since that commit. After
+the initial CSV is created with new tokens, each subsequent build updates
+that same CSV with the entries the build introduces. When no git repository
+exists or no commit is provided, the command behaves the same as a plain
+``add``.
+
+.. code-block:: sh
+
+   ./database.py add --discard-temporary COMMIT --database DIR_DATABASE_NAME \
+       ELF_OR_DATABASE_FILE
+
+When the directory database holds a CSV created after the provided commit,
+the ``add`` command with ``--discard-temporary`` retrieves that CSV, discards
+its entries that no longer appear in the current build, and then adds the
+new token entries from the current build.
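+
+For illustration, here is a minimal sketch of the equivalent Python API
+usage; the ``tokens/`` directory and ``firmware.elf`` names are
+hypothetical:
+
+.. code-block:: python
+
+   from pathlib import Path
+
+   from pw_tokenizer import database, tokens
+
+   # Load the directory database and the tokens from a new build.
+   dir_db = tokens.DatabaseFile.create(Path('tokens/'))
+   elf_db = database.load_token_database('firmware.elf')
+
+   # Discard temporary entries relative to HEAD, add new entries, and
+   # write the result back to the directory database.
+   dir_db.add_and_discard_temporary(elf_db.entries(), 'HEAD')
+   dir_db.write_to_file()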
+
GN integration
^^^^^^^^^^^^^^
Token databases may be updated or created as part of a GN build. The
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index 0fff8f4..496f6f4 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -16,8 +16,11 @@
import json
import io
+import os
from pathlib import Path
import shutil
+import stat
+import subprocess
import sys
import tempfile
import unittest
@@ -158,6 +161,16 @@
return io.TextIOWrapper(output, write_through=True)
+def _remove_readonly(func, path, excinfo) -> None:  # pylint: disable=unused-argument
+    """Clears the read-only bit and retries the removal that failed."""
+    print('Path attempted to be deleted:', path)
+    if not os.access(path, os.W_OK):
+        # Clear the read-only bit.
+        os.chmod(path, stat.S_IWUSR)
+        # Retry the removal that previously failed.
+        func(path)
+
+
class DatabaseCommandLineTest(unittest.TestCase):
"""Tests the database.py command line interface."""
def setUp(self) -> None:
@@ -318,9 +331,10 @@
self._csv_test_domain = CSV_TEST_DOMAIN
def tearDown(self) -> None:
- shutil.rmtree(self._dir)
+ shutil.rmtree(self._dir, onerror=_remove_readonly)
def test_add_csv_to_dir(self) -> None:
+ """Tests a CSV can be created within the database."""
run_cli('add', '--database', self._db_dir, f'{self._elf}#TEST_DOMAIN')
directory = list(self._db_dir.iterdir())
@@ -332,6 +346,7 @@
self._db_csv.read_text().splitlines())
def test_add_all_domains_to_dir(self) -> None:
+ """Tests a CSV with all domains can be added to the database."""
run_cli('add', '--database', self._db_dir, f'{self._elf}#.*')
directory = list(self._db_dir.iterdir())
@@ -343,6 +358,7 @@
self._db_csv.read_text().splitlines())
def test_not_adding_existing_tokens(self) -> None:
+ """Tests duplicate tokens are not added to the database."""
run_cli('add', '--database', self._db_dir, f'{self._elf}#TEST_DOMAIN')
run_cli('add', '--database', self._db_dir, f'{self._elf}#TEST_DOMAIN')
directory = list(self._db_dir.iterdir())
@@ -354,6 +370,155 @@
self.assertEqual(self._csv_test_domain.splitlines(),
self._db_csv.read_text().splitlines())
+    def test_adding_tokens_without_git_repo(self) -> None:
+ """Tests creating new files with new entries when no repo exists."""
+ # Add CSV_TEST_DOMAIN to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, f'{self._elf}#TEST_DOMAIN')
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ first_csv_in_db = directory.pop()
+
+ self.assertEqual(self._csv_test_domain.splitlines(),
+ first_csv_in_db.read_text().splitlines())
+ # Add CSV_ALL_DOMAINS to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, f'{self._elf}#.*')
+ directory = list(self._db_dir.iterdir())
+ # Assert two different CSVs were created to store new tokens.
+ self.assertEqual(2, len(directory))
+ # Retrieve the other CSV in the directory.
+        second_csv_in_db = (directory[0] if directory[0] != first_csv_in_db
+                            else directory[1])
+
+ self.assertNotEqual(first_csv_in_db, second_csv_in_db)
+ self.assertEqual(self._csv_test_domain.splitlines(),
+ first_csv_in_db.read_text().splitlines())
+
+ # Retrieve entries that exclusively exist in CSV_ALL_DOMAINS
+ # as CSV_ALL_DOMAINS contains all entries in TEST_DOMAIN.
+ entries_exclusively_in_all_domain = (
+ set(CSV_ALL_DOMAINS.splitlines()) -
+ set(self._csv_test_domain.splitlines()))
+ # Ensure only new tokens not in CSV_TEST_DOMAIN were added to
+ # the second CSV added to the directory database.
+ self.assertEqual(entries_exclusively_in_all_domain,
+ set(second_csv_in_db.read_text().splitlines()))
+
+    def test_untracked_files_in_dir(self) -> None:
+ """Tests untracked CSVs are reused by the database."""
+ subprocess.run(['git', 'init'], cwd=self._dir)
+ # Add CSV_TEST_DOMAIN to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, '--discard-temporary',
+ 'HEAD', f'{self._elf}#TEST_DOMAIN')
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ first_path_in_db = directory.pop()
+
+ self.assertEqual(self._csv_test_domain.splitlines(),
+ first_path_in_db.read_text().splitlines())
+ # Retrieve the untracked CSV in the Git repository and discard
+ # tokens that do not exist in CSV_DEFAULT_DOMAIN.
+ run_cli('add', '--database', self._db_dir, '--discard-temporary',
+ 'HEAD', self._elf)
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ reused_path_in_db = directory.pop()
+        # Ensure the first path created is the one being reused and its
+        # content now matches CSV_DEFAULT_DOMAIN.
+ self.assertEqual(first_path_in_db, reused_path_in_db)
+ self.assertEqual(CSV_DEFAULT_DOMAIN.splitlines(),
+ reused_path_in_db.read_text().splitlines())
+
+ def test_adding_multiple_elf_files(self) -> None:
+ """Tests adding multiple elf files to a file in the database."""
+ # Add CSV_TEST_DOMAIN to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, f'{self._elf}#TEST_DOMAIN',
+ self._elf)
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+ # Combines CSV_DEFAULT_DOMAIN and TEST_DOMAIN into a unique set
+ # of token entries.
+ entries_from_default_and_test_domain = set(
+ CSV_DEFAULT_DOMAIN.splitlines()).union(
+ set(self._csv_test_domain.splitlines()))
+ # Multiple ELF files were added at once to a single CSV.
+ self.assertEqual(entries_from_default_and_test_domain,
+ set(directory.pop().read_text().splitlines()))
+
+ def test_discarding_old_entries(self) -> None:
+ """Tests discarding old entries for new entries when re-adding."""
+ subprocess.run(['git', 'init'], cwd=self._dir)
+ # Add CSV_ALL_DOMAINS to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, '--discard-temporary',
+ 'HEAD', f'{self._elf}#.*')
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ untracked_path_in_db = directory.pop()
+
+ self.assertEqual(CSV_ALL_DOMAINS.splitlines(),
+ untracked_path_in_db.read_text().splitlines())
+ # Add CSV_DEFAULT_DOMAIN and CSV_TEST_DOMAIN to a CSV in the
+ # directory database, while replacing entries in CSV_ALL_DOMAINS
+ # that no longer exist.
+ run_cli('add', '--database', self._db_dir, '--discard-temporary',
+ 'HEAD', f'{self._elf}#TEST_DOMAIN', self._elf)
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ reused_path_in_db = directory.pop()
+ # Combines CSV_DEFAULT_DOMAIN and TEST_DOMAIN.
+ entries_from_default_and_test_domain = set(
+ CSV_DEFAULT_DOMAIN.splitlines()).union(
+ set(self._csv_test_domain.splitlines()))
+
+ self.assertEqual(untracked_path_in_db, reused_path_in_db)
+ self.assertEqual(entries_from_default_and_test_domain,
+ set(reused_path_in_db.read_text().splitlines()))
+
+ def test_retrieving_csv_from_commit(self) -> None:
+ """Tests retrieving a CSV from a commit and removing temp tokens."""
+ subprocess.run(['git', 'init'], cwd=self._dir)
+ subprocess.run(
+ ['git', 'commit', '--allow-empty', '-m', 'First Commit'],
+ cwd=self._dir)
+ # Add CSV_ALL_DOMAINS to a new CSV in the directory database.
+ run_cli('add', '--database', self._db_dir, f'{self._elf}#.*')
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ tracked_path_in_db = directory.pop()
+
+ self.assertEqual(CSV_ALL_DOMAINS.splitlines(),
+ tracked_path_in_db.read_text().splitlines())
+        # Commit the CSV so the untracked-file check does not retrieve it
+        # on the next run.
+ subprocess.run(['git', 'add', '--all'], cwd=self._dir)
+ subprocess.run(
+ ['git', 'commit', '-m', 'Adding a CSV to a new commit.'],
+ cwd=self._dir)
+ # Retrieve the CSV in HEAD and discard tokens that exist in
+ # CSV_ALL_DOMAINS and do not exist in CSV_TEST_DOMAIN.
+ run_cli('add', '--database', self._db_dir, '--discard-temporary',
+ 'HEAD~', f'{self._elf}#TEST_DOMAIN')
+ directory = list(self._db_dir.iterdir())
+
+ self.assertEqual(1, len(directory))
+
+ reused_path_in_db = directory.pop()
+
+ self.assertEqual(self._csv_test_domain.splitlines(),
+ reused_path_in_db.read_text().splitlines())
+
if __name__ == '__main__':
unittest.main()
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 1d67042..0852732 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -21,6 +21,7 @@
import argparse
from datetime import datetime
import glob
+import itertools
import json
import logging
import os
@@ -323,18 +324,24 @@
def _handle_add(token_database: tokens.DatabaseFile,
- databases: List[tokens.Database]) -> None:
+ databases: List[tokens.Database],
+ commit: Optional[str]) -> None:
initial = len(token_database)
+ if commit:
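+        # Flatten the entries from every input database into one iterable.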
+ entries = itertools.chain.from_iterable(db.entries()
+ for db in databases)
+ token_database.add_and_discard_temporary(entries, commit)
+ else:
+ for source in databases:
+ token_database.add(source.entries())
- for source in databases:
- token_database.add(source.entries())
+ token_database.write_to_file()
- number_of_entries_added = len(token_database) - initial
+ number_of_changes = len(token_database) - initial
- token_database.write_to_file()
-
- _LOG.info('Added %d entries to %s', number_of_entries_added,
- token_database.path)
+ if number_of_changes:
+ _LOG.info('Added %d entries to %s', number_of_changes,
+ token_database.path)
def _handle_mark_removed(token_database: tokens.DatabaseFile,
@@ -564,6 +571,14 @@
'of ELF files or other token databases. Missing entries are NOT '
'marked as removed.'))
subparser.set_defaults(handler=_handle_add)
+    subparser.add_argument(
+        '--discard-temporary',
+        dest='commit',
+        metavar='COMMIT',
+        help=('Deletes temporary tokens in memory and on disk when a CSV '
+              'exists within the provided commit. Afterwards, new strings '
+              'are added to the database from a set of ELF files or other '
+              'token databases. Missing entries are NOT marked as removed.'))
# The 'mark_removed' command marks removed entries to match a set of ELFs.
subparser = subparsers.add_parser(
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 9272ac6..ea75607 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -23,6 +23,7 @@
from pathlib import Path
import re
import struct
+import subprocess
from typing import (BinaryIO, Callable, Dict, Iterable, Iterator, List,
NamedTuple, Optional, Pattern, TextIO, Tuple, Union,
ValuesView)
@@ -487,6 +488,16 @@
def write_to_file(self) -> None:
"""Exports in the original format to the original path."""
+    @abstractmethod
+    def add_and_discard_temporary(self,
+                                  entries: Iterable[TokenizedStringEntry],
+                                  commit: str) -> None:
+        """Discards temporary entries, adds new ones, and exports.
+
+        Temporary entries are removed from the database before the incoming
+        entries are added, so only entries that recur across builds are
+        written to memory and disk in the original format.
+        """
+
class _BinaryDatabase(DatabaseFile):
def __init__(self, path: Path, fd: BinaryIO) -> None:
@@ -497,6 +508,13 @@
with self.path.open('wb') as fd:
write_binary(self, fd)
+ def add_and_discard_temporary(self,
+ entries: Iterable[TokenizedStringEntry],
+ commit: str) -> None:
+ # TODO(b/241471465): Implement adding new tokens and removing
+ # temporary entries for binary databases.
+ raise NotImplementedError
+
class _CSVDatabase(DatabaseFile):
def __init__(self, path: Path, fd: TextIO) -> None:
@@ -507,6 +525,13 @@
with self.path.open('wb') as fd:
write_csv(self, fd)
+ def add_and_discard_temporary(self,
+ entries: Iterable[TokenizedStringEntry],
+ commit: str) -> None:
+ # TODO(b/241471465): Implement adding new tokens and removing
+ # temporary entries for CSV databases.
+ raise NotImplementedError
+
def _parse_directory(paths: Iterable[Path]) -> Iterable[TokenizedStringEntry]:
"""Parses TokenizedStringEntries from files in the directory as a CSV."""
@@ -515,31 +540,137 @@
yield from parse_csv(fd)
+def _git_stdout(commands: List, cwd: Path) -> str:
+ """Runs the git commands in the provided cwd and captures the output."""
+ return subprocess.run(['git'] + commands,
+ capture_output=True,
+ check=True,
+ cwd=cwd,
+ text=True).stdout.strip()
+
+
+def _new_entries(
+ entries: Iterable[TokenizedStringEntry],
+ database: Dict[_EntryKey,
+ TokenizedStringEntry]) -> List[TokenizedStringEntry]:
+ """Retrieves the entries not in the database."""
+ return [entry for entry in entries if entry.key() not in database]
+
+
class _DirectoryDatabase(DatabaseFile):
def __init__(self, directory: Path) -> None:
- # Create a DatabaseFile using the directory.
super().__init__(directory, _parse_directory(directory.iterdir()))
- self._original_entries = self._database.copy()
- # TODO(b/239551346): Check whether a CSV exists in
- # HEAD. Exclude the CSV when loading the database.
- # Generate a unique filename not in the directory.
- self._csv_file = directory / f'{uuid4().hex}.csv'
- while self._csv_file.exists():
- self._csv_file = directory / f'{uuid4().hex}.csv'
def write_to_file(self) -> None:
"""Exports the database in CSV format to the original path."""
- new_entries = self._get_new_token_entries()
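+        # Re-parse the directory so only entries not yet on disk are written.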
+ database = Database(_parse_directory(self.path.iterdir()))._database # pylint: disable=protected-access
+ new_entries = _new_entries(self.entries(), database)
if new_entries:
- with self._csv_file.open('wb') as fd:
- for entry in new_entries:
- _write_csv_line(fd, entry)
+ with self._create_filename().open('wb') as fd:
+ write_csv(Database(new_entries), fd)
- def _get_new_token_entries(self) -> List[TokenizedStringEntry]:
- """Collects new entries and returns the total new entries found."""
- new_token_entries = []
- for entry in sorted(self.entries()):
- original_entry = self._original_entries.get(entry.key())
- if original_entry != entry:
- new_token_entries.append(entry)
- return new_token_entries
+ def _path_to_repo(self) -> Path:
+ """Creates a path to the repository the database is within."""
+ return Path(_git_stdout(['rev-parse', '--show-toplevel'],
+ self.path)).resolve()
+
+ def _git_paths(self, commands: List, path_to_repo: Path) -> List[Path]:
+ """Creates a list of complete paths to the repo."""
+ try:
+ changes = _git_stdout(commands + [self.path],
+ path_to_repo).splitlines()
+ return [path_to_repo / path for path in changes]
+ except subprocess.CalledProcessError:
+ return []
+
+ def _find_latest_csv(self, commit: str) -> Path:
+ """Finds or creates a CSV to utilize for new entries.
+
+ - Searches for untracked files in the repo to re-use.
+ - When no untracked files exists, a difference between commit
+ and HEAD searches for a filename in the commit to re-use.
+ - Multiple differences between HEAD~ and HEAD causes a search
+ for a CSV in the latest commit.
+ - When nothing is yielded from the previous searches or no git repo
+ exists, then a new filename is created.
+ """
+ # Finds the repository the database is within.
+ try:
+ path_to_repo = self._path_to_repo()
+ except subprocess.CalledProcessError:
+ return self._create_filename()
+
+        # Find untracked changes in the database directory.
+ untracked_changes = self._git_paths(
+ ['ls-files', '--others', '--exclude-standard'], path_to_repo)
+
+ if untracked_changes:
+ # TODO(b/241297219): Handle multiple paths in the commit
+ # There are uncommitted files in the repo.
+ assert len(untracked_changes) < 2
+ return untracked_changes.pop()
+
+        # TODO(b/241297219): Add an argument to specify which commit to
+        # run the git diff against.
+        # Find files added since the provided commit in the database
+        # directory.
+ tracked_changes = self._git_paths(
+ ['diff', '--name-only', '--diff-filter=A', commit], path_to_repo)
+
+ if tracked_changes:
+            # If exactly one file was added in the comparison, reuse that
+            # filename. Otherwise, when multiple files were added, fall
+            # back to the latest commit (HEAD~) to retrieve a CSV.
+ if len(tracked_changes) == 1:
+ return tracked_changes.pop()
+
+ tracked_changes_from_latest_commit = self._git_paths(
+ ['diff', '--name-only', '--diff-filter=A', 'HEAD~'],
+ path_to_repo)
+ # TODO(b/241297219): Handle multiple paths in the commit.
+ assert len(tracked_changes_from_latest_commit) < 2
+ return tracked_changes_from_latest_commit.pop()
+
+ return self._create_filename()
+
+ def _create_filename(self) -> Path:
+ """Generates a unique filename not in the directory."""
+        # No reusable tracked or untracked CSV exists; generate a new name.
+ path = self.path / f'{uuid4().hex}.csv'
+ while path.exists():
+ path = self.path / f'{uuid4().hex}.csv'
+ return path
+
+ def add_and_discard_temporary(self,
+ entries: Iterable[TokenizedStringEntry],
+ commit: str) -> None:
+ """Adds and updates entries to export in the CSV format.
+
+ - New entries are collected from the incoming entries.
+ - When no CSV exists, the new entries are loaded into a database and
+ written to disk.
+ - When a CSV exists and incoming entries do not exist within the CSV
+ database, the temporary entries are removed from the directory and
+ CSV database.
+ - New entries are added to the CSV database and re-written to disk.
+ """
+ db = Database(entries)._database # pylint: disable=protected-access
+ new_entries = _new_entries(db.values(), self._database)
+ csv_path = self._find_latest_csv(commit)
+
+ if csv_path.exists():
+ # Loading the CSV as a DatabaseFile.
+ csv_db = DatabaseFile.create(csv_path)
+ # Collects the keys to delete from the CSV database.
+ keys_to_delete = (e.key()
+ for e in _new_entries(csv_db.entries(), db))
+ for key in keys_to_delete:
+ del self._database[key]
+ del csv_db._database[key] # pylint: disable=protected-access
+ csv_db.add(new_entries)
+ csv_db.write_to_file()
+ elif new_entries:
+ with csv_path.open('wb') as fd:
+ write_csv(Database(new_entries), fd)
+
+ self.add(new_entries)
diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 292043d..a22ae04 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py
@@ -594,8 +594,8 @@
self._db_csv.write_text(CSV_DATABASE)
csv = tokens.DatabaseFile.create(self._db_csv)
directory_db = database.load_token_database(self._db_dir)
- self.assertEqual(str(csv), str(directory_db))
self.assertEqual(1, len(list(self._db_dir.iterdir())))
+ self.assertEqual(str(csv), str(directory_db))
def test_loading_multiples_files(self) -> None:
self._db_csv.write_text(CSV_DATABASE_3)
@@ -612,8 +612,8 @@
all_databases_merged = tokens.Database.merged(first_csv, second_csv,
third_csv)
directory_db = database.load_token_database(self._db_dir)
- self.assertEqual(str(all_databases_merged), str(directory_db))
self.assertEqual(3, len(list(self._db_dir.iterdir())))
+ self.assertEqual(str(all_databases_merged), str(directory_db))
def test_loading_multiples_files_with_removal_dates(self) -> None:
self._db_csv.write_text(CSV_DATABASE)
@@ -630,9 +630,8 @@
all_databases_merged = tokens.Database.merged(first_csv, second_csv,
third_csv)
directory_db = database.load_token_database(self._db_dir)
-
- self.assertEqual(str(all_databases_merged), str(directory_db))
self.assertEqual(3, len(list(self._db_dir.iterdir())))
+ self.assertEqual(str(all_databases_merged), str(directory_db))
if __name__ == '__main__':