| #!/usr/bin/env python3 |
| # Copyright 2020 The Pigweed Authors |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| # use this file except in compliance with the License. You may obtain a copy of |
| # the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| # License for the specific language governing permissions and limitations under |
| # the License. |
| """Tests for the tokens module.""" |
| |
| import datetime |
| import io |
| import logging |
| import unittest |
| |
| from pw_tokenizer import tokens |
| from pw_tokenizer.tokens import _LOG |
| |
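# Test database in CSV form. Each row is a 32-bit token in hex, an optional
# removal date, and the tokenized string in double quotes.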
| CSV_DATABASE = '''\ |
| 00000000,2019-06-10,"" |
| 141c35d5, ,"The answer: ""%s""" |
| 2db1515f, ,"%u%d%02x%X%hu%hhu%d%ld%lu%lld%llu%c%c%c" |
| 2e668cd6,2019-06-11,"Jello, world!" |
| 31631781, ,"%d" |
| 61fd1e26, ,"%ld" |
| 68ab92da, ,"%s there are %x (%.2f) of them%c" |
| 7b940e2a, ,"Hello %s! %hd %e" |
| 851beeb6, ,"%u %d" |
| 881436a0, ,"The answer is: %s" |
| ad002c97, ,"%llx" |
| b3653e13,2019-06-12,"Jello!" |
| b912567b, ,"%x%lld%1.2f%s" |
| cc6d3131,2020-01-01,"Jello?" |
| e13b0f94, ,"%llu" |
| e65aefef,2019-06-10,"Won't fit : %s%d" |
| ''' |
| |
# The date 2019-06-10 is 07E3-06-0A in hex; stored little-endian, its bytes
# appear in the database as 0A 06 E3 07.
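# As a sketch (assuming the day/month/year layout described above), the
# packed date bytes can be reproduced with the standard struct module:
#
#   struct.pack('<BBH', 10, 6, 2019) == b'\x0a\x06\xe3\x07'
#
# Entries with no removal date store b'\xff\xff\xff\xff' instead.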
| BINARY_DATABASE = ( |
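    # Header layout (inferred from the bytes below): the magic string
    # b'TOKENS', a two-byte version, a little-endian uint32 entry count
    # (0x10 = 16), and four reserved bytes.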
| b'TOKENS\x00\x00\x10\x00\x00\x00\0\0\0\0' # header (0x10 entries) |
| b'\x00\x00\x00\x00\x0a\x06\xe3\x07' # 0x01 |
| b'\xd5\x35\x1c\x14\xff\xff\xff\xff' # 0x02 |
| b'\x5f\x51\xb1\x2d\xff\xff\xff\xff' # 0x03 |
| b'\xd6\x8c\x66\x2e\x0b\x06\xe3\x07' # 0x04 |
| b'\x81\x17\x63\x31\xff\xff\xff\xff' # 0x05 |
| b'\x26\x1e\xfd\x61\xff\xff\xff\xff' # 0x06 |
| b'\xda\x92\xab\x68\xff\xff\xff\xff' # 0x07 |
| b'\x2a\x0e\x94\x7b\xff\xff\xff\xff' # 0x08 |
| b'\xb6\xee\x1b\x85\xff\xff\xff\xff' # 0x09 |
| b'\xa0\x36\x14\x88\xff\xff\xff\xff' # 0x0a |
| b'\x97\x2c\x00\xad\xff\xff\xff\xff' # 0x0b |
| b'\x13\x3e\x65\xb3\x0c\x06\xe3\x07' # 0x0c |
| b'\x7b\x56\x12\xb9\xff\xff\xff\xff' # 0x0d |
| b'\x31\x31\x6d\xcc\x01\x01\xe4\x07' # 0x0e |
| b'\x94\x0f\x3b\xe1\xff\xff\xff\xff' # 0x0f |
| b'\xef\xef\x5a\xe6\x0a\x06\xe3\x07' # 0x10 |
    b'\x00'  # string table starts here; the first entry is the empty string
| b'The answer: "%s"\x00' |
| b'%u%d%02x%X%hu%hhu%d%ld%lu%lld%llu%c%c%c\x00' |
| b'Jello, world!\x00' |
| b'%d\x00' |
| b'%ld\x00' |
| b'%s there are %x (%.2f) of them%c\x00' |
| b'Hello %s! %hd %e\x00' |
| b'%u %d\x00' |
| b'The answer is: %s\x00' |
| b'%llx\x00' |
| b'Jello!\x00' |
| b'%x%lld%1.2f%s\x00' |
| b'Jello?\x00' |
| b'%llu\x00' |
| b'Won\'t fit : %s%d\x00') |
| |
| INVALID_CSV = """\ |
| 1,,"Whoa there!" |
| 2,this is totally invalid,"Whoa there!" |
| 3,,"This one's OK" |
| ,,"Also broken" |
| 5,1845-2-2,"I'm %s fine" |
| 6,"Missing fields" |
| """ |
| |
| |
def read_db_from_csv(csv_str):
    """Parses CSV database text into a tokens.Database."""
| with io.StringIO(csv_str) as csv_db: |
| return tokens.Database(tokens.parse_csv(csv_db)) |
| |
| |
| class TokenDatabaseTest(unittest.TestCase): |
| """Tests the token database class.""" |
| def test_csv(self): |
| db = read_db_from_csv(CSV_DATABASE) |
| self.assertEqual(str(db), CSV_DATABASE) |
| |
| db = read_db_from_csv('') |
| self.assertEqual(str(db), '') |
| |
| def test_csv_formatting(self): |
| db = read_db_from_csv('') |
| self.assertEqual(str(db), '') |
| |
| db = read_db_from_csv('abc123,2048-4-1,Fake string\n') |
| self.assertEqual(str(db), '00abc123,2048-04-01,"Fake string"\n') |
| |
| db = read_db_from_csv('1,1990-01-01,"Quotes"""\n' |
| '0,1990-02-01,"Commas,"",,"\n') |
| self.assertEqual(str(db), ('00000000,1990-02-01,"Commas,"",,"\n' |
| '00000001,1990-01-01,"Quotes"""\n')) |
| |
| def test_bad_csv(self): |
| with self.assertLogs(_LOG, logging.ERROR) as logs: |
| db = read_db_from_csv(INVALID_CSV) |
| |
| self.assertGreaterEqual(len(logs.output), 3) |
| self.assertEqual(len(db.token_to_entries), 3) |
| |
| self.assertEqual(db.token_to_entries[1][0].string, 'Whoa there!') |
| self.assertFalse(db.token_to_entries[2]) |
| self.assertEqual(db.token_to_entries[3][0].string, "This one's OK") |
| self.assertFalse(db.token_to_entries[4]) |
| self.assertEqual(db.token_to_entries[5][0].string, "I'm %s fine") |
| self.assertFalse(db.token_to_entries[6]) |
| |
| def test_lookup(self): |
| db = read_db_from_csv(CSV_DATABASE) |
| self.assertEqual(db.token_to_entries[0x9999], []) |
| |
| matches = db.token_to_entries[0x2e668cd6] |
| self.assertEqual(len(matches), 1) |
| jello = matches[0] |
| |
| self.assertEqual(jello.token, 0x2e668cd6) |
| self.assertEqual(jello.string, 'Jello, world!') |
| self.assertEqual(jello.date_removed, datetime.datetime(2019, 6, 11)) |
| |
| matches = db.token_to_entries[0xe13b0f94] |
| self.assertEqual(len(matches), 1) |
| llu = matches[0] |
| self.assertEqual(llu.token, 0xe13b0f94) |
| self.assertEqual(llu.string, '%llu') |
| self.assertIsNone(llu.date_removed) |
| |
| answer, = db.token_to_entries[0x141c35d5] |
| self.assertEqual(answer.string, 'The answer: "%s"') |
| |
| def test_collisions(self): |
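        # 'o000' and '0Q1Q' are a known collision pair: both produce the same
        # token under the fixed-length 65599 hash.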
| hash_1 = tokens.pw_tokenizer_65599_fixed_length_hash('o000', 96) |
| hash_2 = tokens.pw_tokenizer_65599_fixed_length_hash('0Q1Q', 96) |
| self.assertEqual(hash_1, hash_2) |
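        # Cross-check with a minimal sketch of the 65599 fixed-length hash.
        # This assumes the algorithm is the string length plus the sum of
        # char * 65599**(i + 1), modulo 2**32; it is not the canonical
        # implementation.
        def sketch_65599_hash(string, hash_length):
            value = len(string)
            coefficient = 65599
            for char in string[:hash_length]:
                value = (value + ord(char) * coefficient) % 2**32
                coefficient = (coefficient * 65599) % 2**32
            return value

        self.assertEqual(sketch_65599_hash('o000', 96),
                         sketch_65599_hash('0Q1Q', 96))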
| |
| db = tokens.Database.from_strings(['o000', '0Q1Q']) |
| |
| self.assertEqual(len(db.token_to_entries[hash_1]), 2) |
| self.assertCountEqual( |
| [entry.string for entry in db.token_to_entries[hash_1]], |
| ['o000', '0Q1Q']) |
| |
| def test_purge(self): |
| db = read_db_from_csv(CSV_DATABASE) |
| original_length = len(db.token_to_entries) |
| |
| self.assertEqual(db.token_to_entries[0][0].string, '') |
| self.assertEqual(db.token_to_entries[0x31631781][0].string, '%d') |
| self.assertEqual(db.token_to_entries[0x2e668cd6][0].string, |
| 'Jello, world!') |
| self.assertEqual(db.token_to_entries[0xb3653e13][0].string, 'Jello!') |
| self.assertEqual(db.token_to_entries[0xcc6d3131][0].string, 'Jello?') |
| self.assertEqual(db.token_to_entries[0xe65aefef][0].string, |
| "Won't fit : %s%d") |
| |
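        # purge() is expected to drop entries removed on or before the given
        # date and to keep entries that have no removal date.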
| db.purge(datetime.datetime(2019, 6, 11)) |
| self.assertLess(len(db.token_to_entries), original_length) |
| |
| self.assertFalse(db.token_to_entries[0]) |
| self.assertEqual(db.token_to_entries[0x31631781][0].string, '%d') |
| self.assertFalse(db.token_to_entries[0x2e668cd6]) |
| self.assertEqual(db.token_to_entries[0xb3653e13][0].string, 'Jello!') |
| self.assertEqual(db.token_to_entries[0xcc6d3131][0].string, 'Jello?') |
| self.assertFalse(db.token_to_entries[0xe65aefef]) |
| |
| def test_merge(self): |
| """Tests the tokens.Database merge method.""" |
| |
| db = tokens.Database() |
| |
| # Test basic merging into an empty database. |
| db.merge( |
| tokens.Database([ |
| tokens.TokenizedStringEntry(1, 'one', datetime.datetime.min), |
| tokens.TokenizedStringEntry(2, 'two', datetime.datetime.min), |
| ])) |
| self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'}) |
| self.assertEqual(db.token_to_entries[1][0].date_removed, |
| datetime.datetime.min) |
| self.assertEqual(db.token_to_entries[2][0].date_removed, |
| datetime.datetime.min) |
| |
        # Test merging in entries with and without removal dates.
| db.merge( |
| tokens.Database([ |
| tokens.TokenizedStringEntry(3, 'three'), |
| tokens.TokenizedStringEntry(4, 'four', datetime.datetime.min), |
| ])) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four'})
| self.assertIsNone(db.token_to_entries[3][0].date_removed) |
| self.assertEqual(db.token_to_entries[4][0].date_removed, |
| datetime.datetime.min) |
| |
| # Test merging in one entry. |
| db.merge(tokens.Database([ |
| tokens.TokenizedStringEntry(5, 'five'), |
| ])) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
| self.assertEqual(db.token_to_entries[4][0].date_removed, |
| datetime.datetime.min) |
| self.assertIsNone(db.token_to_entries[5][0].date_removed) |
| |
        # Merge in repeated entries with different removal dates.
| db.merge( |
| tokens.Database([ |
| tokens.TokenizedStringEntry(4, 'four', datetime.datetime.max), |
| tokens.TokenizedStringEntry(5, 'five', datetime.datetime.max), |
| ])) |
| self.assertEqual(len(db.entries()), 5) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
| self.assertEqual(db.token_to_entries[4][0].date_removed, |
| datetime.datetime.max) |
| self.assertIsNone(db.token_to_entries[5][0].date_removed) |
| |
        # Merge in the same entries again, this time without removal dates.
| db.merge( |
| tokens.Database([ |
| tokens.TokenizedStringEntry(4, 'four'), |
| tokens.TokenizedStringEntry(5, 'five') |
| ])) |
| self.assertEqual(len(db.entries()), 5) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
| self.assertIsNone(db.token_to_entries[4][0].date_removed) |
| self.assertIsNone(db.token_to_entries[5][0].date_removed) |
| |
        # Merge in an empty database.
| db.merge(tokens.Database([])) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
| |
| def test_merge_multiple(self): |
| db = tokens.Database.merged( |
| tokens.Database( |
| [tokens.TokenizedStringEntry(1, 'one', |
| datetime.datetime.max)]), |
| tokens.Database( |
| [tokens.TokenizedStringEntry(2, 'two', |
| datetime.datetime.min)]), |
| tokens.Database( |
| [tokens.TokenizedStringEntry(1, 'one', |
| datetime.datetime.min)])) |
| self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'}) |
| |
| db.merge( |
| tokens.Database([ |
| tokens.TokenizedStringEntry(4, 'four', datetime.datetime.max) |
| ]), |
| tokens.Database( |
| [tokens.TokenizedStringEntry(2, 'two', |
| datetime.datetime.max)]), |
| tokens.Database([ |
| tokens.TokenizedStringEntry(3, 'three', datetime.datetime.min) |
| ])) |
        self.assertEqual({str(e) for e in db.entries()},
                         {'one', 'two', 'three', 'four'})
| |
| def test_entry_counts(self): |
| self.assertEqual(len(CSV_DATABASE.splitlines()), 16) |
| |
| db = read_db_from_csv(CSV_DATABASE) |
| self.assertEqual(len(db.entries()), 16) |
| self.assertEqual(len(db.token_to_entries), 16) |
| |
| # Add two strings with the same hash. |
| db.add(['o000', '0Q1Q']) |
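        # Both strings become entries, but they are indexed under one token.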
| |
| self.assertEqual(len(db.entries()), 18) |
| self.assertEqual(len(db.token_to_entries), 17) |
| |
| def test_mark_removals(self): |
| db = tokens.Database.from_strings( |
| ['MILK', 'apples', 'oranges', 'CHEESE', 'pears']) |
| |
| self.assertTrue( |
| all(entry.date_removed is None for entry in db.entries())) |
| date_1 = datetime.datetime(1, 2, 3) |
| |
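        # mark_removals() receives the strings still in use; every other
        # entry is marked as removed as of the given date.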
| db.mark_removals(['apples', 'oranges', 'pears'], date_1) |
| |
| self.assertEqual( |
| db.token_to_entries[db.tokenize('MILK')][0].date_removed, date_1) |
| self.assertEqual( |
| db.token_to_entries[db.tokenize('CHEESE')][0].date_removed, date_1) |
| |
| now = datetime.datetime.now() |
| db.mark_removals(['MILK', 'CHEESE', 'pears']) |
| |
        # Strings are not added or re-added by mark_removals().
| self.assertGreaterEqual( |
| db.token_to_entries[db.tokenize('MILK')][0].date_removed, date_1) |
| self.assertGreaterEqual( |
| db.token_to_entries[db.tokenize('CHEESE')][0].date_removed, date_1) |
| |
| # These strings were removed. |
| self.assertGreaterEqual( |
| db.token_to_entries[db.tokenize('apples')][0].date_removed, now) |
| self.assertGreaterEqual( |
| db.token_to_entries[db.tokenize('oranges')][0].date_removed, now) |
| self.assertIsNone( |
| db.token_to_entries[db.tokenize('pears')][0].date_removed) |
| |
| def test_add(self): |
| db = tokens.Database() |
| db.add(['MILK', 'apples']) |
| self.assertEqual({e.string for e in db.entries()}, {'MILK', 'apples'}) |
| |
| db.add(['oranges', 'CHEESE', 'pears']) |
| self.assertEqual(len(db.entries()), 5) |
| |
| db.add(['MILK', 'apples', 'only this one is new']) |
| self.assertEqual(len(db.entries()), 6) |
| |
| db.add(['MILK']) |
        self.assertEqual(
            {e.string for e in db.entries()},
            {'MILK', 'apples', 'oranges', 'CHEESE', 'pears',
             'only this one is new'})
| |
| def test_binary_format_write(self): |
| db = read_db_from_csv(CSV_DATABASE) |
| |
| with io.BytesIO() as fd: |
| tokens.write_binary(db, fd) |
| binary_db = fd.getvalue() |
| |
| self.assertEqual(BINARY_DATABASE, binary_db) |
| |
| def test_binary_format_parse(self): |
| with io.BytesIO(BINARY_DATABASE) as binary_db: |
| db = tokens.Database(tokens.parse_binary(binary_db)) |
| |
| self.assertEqual(str(db), CSV_DATABASE) |
| |
| |
| class TestFilter(unittest.TestCase): |
| """Tests the filtering functionality.""" |
| def setUp(self): |
| super().setUp() |
| |
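        # Note that token 2 is shared by three entries, so filtering must
        # handle multiple entries per token.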
| self.db = tokens.Database([ |
| tokens.TokenizedStringEntry(1, 'Luke'), |
| tokens.TokenizedStringEntry(2, 'Leia'), |
| tokens.TokenizedStringEntry(2, 'Darth Vader'), |
| tokens.TokenizedStringEntry(2, 'Emperor Palpatine'), |
| tokens.TokenizedStringEntry(3, 'Han'), |
| tokens.TokenizedStringEntry(4, 'Chewbacca'), |
| tokens.TokenizedStringEntry(5, 'Darth Maul'), |
| tokens.TokenizedStringEntry(6, 'Han Solo'), |
| ]) |
| |
| def test_filter_include_single_regex(self): |
| self.db.filter(include=[' ']) # anything with a space |
| self.assertEqual( |
| set(e.string for e in self.db.entries()), |
| {'Darth Vader', 'Emperor Palpatine', 'Darth Maul', 'Han Solo'}) |
| |
| def test_filter_include_multiple_regexes(self): |
| self.db.filter(include=['Darth', 'cc', '^Han$']) |
| self.assertEqual(set(e.string for e in self.db.entries()), |
| {'Darth Vader', 'Darth Maul', 'Han', 'Chewbacca'}) |
| |
| def test_filter_include_no_matches(self): |
| self.db.filter(include=['Gandalf']) |
| self.assertFalse(self.db.entries()) |
| |
| def test_filter_exclude_single_regex(self): |
        self.db.filter(exclude=['^[^L]'])  # anything not starting with 'L'
| self.assertEqual(set(e.string for e in self.db.entries()), |
| {'Luke', 'Leia'}) |
| |
| def test_filter_exclude_multiple_regexes(self): |
| self.db.filter(exclude=[' ', 'Han', 'Chewbacca']) |
| self.assertEqual(set(e.string for e in self.db.entries()), |
| {'Luke', 'Leia'}) |
| |
| def test_filter_exclude_no_matches(self): |
| self.db.filter(exclude=['.*']) |
| self.assertFalse(self.db.entries()) |
| |
| def test_filter_include_and_exclude(self): |
| self.db.filter(include=[' '], exclude=['Darth', 'Emperor']) |
| self.assertEqual(set(e.string for e in self.db.entries()), |
| {'Han Solo'}) |
| |
| def test_filter_neither_include_nor_exclude(self): |
| self.db.filter() |
| self.assertEqual( |
| set(e.string for e in self.db.entries()), { |
| 'Luke', 'Leia', 'Darth Vader', 'Emperor Palpatine', 'Han', |
| 'Chewbacca', 'Darth Maul', 'Han Solo' |
| }) |
| |
| |
| if __name__ == '__main__': |
| unittest.main() |