#!/usr/bin/env python3
# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Tests for the tokens module."""
import datetime
import io
import logging
import unittest
from pw_tokenizer import tokens
from pw_tokenizer.tokens import _LOG
CSV_DATABASE = '''\
00000000,2019-06-10,""
141c35d5, ,"The answer: ""%s"""
2db1515f, ,"%u%d%02x%X%hu%hhu%d%ld%lu%lld%llu%c%c%c"
2e668cd6,2019-06-11,"Jello, world!"
31631781, ,"%d"
61fd1e26, ,"%ld"
68ab92da, ,"%s there are %x (%.2f) of them%c"
7b940e2a, ,"Hello %s! %hd %e"
851beeb6, ,"%u %d"
881436a0, ,"The answer is: %s"
ad002c97, ,"%llx"
b3653e13,2019-06-12,"Jello!"
b912567b, ,"%x%lld%1.2f%s"
cc6d3131,2020-01-01,"Jello?"
e13b0f94, ,"%llu"
e65aefef,2019-06-10,"Won't fit : %s%d"
'''
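
# Each row of the CSV database is "token,date removed,string": an 8-digit
# hexadecimal token, an optional date the string was removed (blank if the
# string is still in use), and the quoted string itself.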
# The date 2019-06-10 is 07E3-06-0A in hex. In database order, it's 0A 06 E3 07.
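# Each entry below is eight bytes: a little-endian uint32 token followed by
# the removal date packed as day, month, and little-endian uint16 year (all
# 0xff bytes if the string was never removed). The null-terminated strings
# follow the entries, in the same order.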
BINARY_DATABASE = (
    b'TOKENS\x00\x00\x10\x00\x00\x00\0\0\0\0'  # header (0x10 entries)
    b'\x00\x00\x00\x00\x0a\x06\xe3\x07'  # 0x01
    b'\xd5\x35\x1c\x14\xff\xff\xff\xff'  # 0x02
    b'\x5f\x51\xb1\x2d\xff\xff\xff\xff'  # 0x03
    b'\xd6\x8c\x66\x2e\x0b\x06\xe3\x07'  # 0x04
    b'\x81\x17\x63\x31\xff\xff\xff\xff'  # 0x05
    b'\x26\x1e\xfd\x61\xff\xff\xff\xff'  # 0x06
    b'\xda\x92\xab\x68\xff\xff\xff\xff'  # 0x07
    b'\x2a\x0e\x94\x7b\xff\xff\xff\xff'  # 0x08
    b'\xb6\xee\x1b\x85\xff\xff\xff\xff'  # 0x09
    b'\xa0\x36\x14\x88\xff\xff\xff\xff'  # 0x0a
    b'\x97\x2c\x00\xad\xff\xff\xff\xff'  # 0x0b
    b'\x13\x3e\x65\xb3\x0c\x06\xe3\x07'  # 0x0c
    b'\x7b\x56\x12\xb9\xff\xff\xff\xff'  # 0x0d
    b'\x31\x31\x6d\xcc\x01\x01\xe4\x07'  # 0x0e
    b'\x94\x0f\x3b\xe1\xff\xff\xff\xff'  # 0x0f
    b'\xef\xef\x5a\xe6\x0a\x06\xe3\x07'  # 0x10
    b'\x00'
    b'The answer: "%s"\x00'
    b'%u%d%02x%X%hu%hhu%d%ld%lu%lld%llu%c%c%c\x00'
    b'Jello, world!\x00'
    b'%d\x00'
    b'%ld\x00'
    b'%s there are %x (%.2f) of them%c\x00'
    b'Hello %s! %hd %e\x00'
    b'%u %d\x00'
    b'The answer is: %s\x00'
    b'%llx\x00'
    b'Jello!\x00'
    b'%x%lld%1.2f%s\x00'
    b'Jello?\x00'
    b'%llu\x00'
    b'Won\'t fit : %s%d\x00')
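
# Malformed database used to check error handling: the rows with an invalid
# date (token 2), a missing token (the blank row), and a missing column
# (token 6) should be rejected and logged, while the valid rows are kept.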
INVALID_CSV = """\
1,,"Whoa there!"
2,this is totally invalid,"Whoa there!"
3,,"This one's OK"
,,"Also broken"
5,1845-2-2,"I'm %s fine"
6,"Missing fields"
"""


def read_db_from_csv(csv_str):
    with io.StringIO(csv_str) as csv_db:
        return tokens.Database(tokens.parse_csv(csv_db))


class TokenDatabaseTest(unittest.TestCase):
    """Tests the token database class."""
    def test_csv(self):
        db = read_db_from_csv(CSV_DATABASE)
        self.assertEqual(str(db), CSV_DATABASE)

        db = read_db_from_csv('')
        self.assertEqual(str(db), '')

    def test_csv_formatting(self):
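        # The assertions below check that writing a database as CSV
        # zero-pads tokens to eight hex digits, normalizes dates to
        # YYYY-MM-DD, quotes strings, and sorts rows by token.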
        db = read_db_from_csv('')
        self.assertEqual(str(db), '')

        db = read_db_from_csv('abc123,2048-4-1,Fake string\n')
        self.assertEqual(str(db), '00abc123,2048-04-01,"Fake string"\n')

        db = read_db_from_csv('1,1990-01-01,"Quotes"""\n'
                              '0,1990-02-01,"Commas,"",,"\n')
        self.assertEqual(str(db), ('00000000,1990-02-01,"Commas,"",,"\n'
                                   '00000001,1990-01-01,"Quotes"""\n'))

    def test_bad_csv(self):
        with self.assertLogs(_LOG, logging.ERROR) as logs:
            db = read_db_from_csv(INVALID_CSV)

        self.assertGreaterEqual(len(logs.output), 3)
        self.assertEqual(len(db.token_to_entries), 3)

        self.assertEqual(db.token_to_entries[1][0].string, 'Whoa there!')
        self.assertFalse(db.token_to_entries[2])
        self.assertEqual(db.token_to_entries[3][0].string, "This one's OK")
        self.assertFalse(db.token_to_entries[4])
        self.assertEqual(db.token_to_entries[5][0].string, "I'm %s fine")
        self.assertFalse(db.token_to_entries[6])

    def test_lookup(self):
        db = read_db_from_csv(CSV_DATABASE)
        self.assertEqual(db.token_to_entries[0x9999], [])

        matches = db.token_to_entries[0x2e668cd6]
        self.assertEqual(len(matches), 1)
        jello = matches[0]

        self.assertEqual(jello.token, 0x2e668cd6)
        self.assertEqual(jello.string, 'Jello, world!')
        self.assertEqual(jello.date_removed, datetime.datetime(2019, 6, 11))

        matches = db.token_to_entries[0xe13b0f94]
        self.assertEqual(len(matches), 1)
        llu = matches[0]

        self.assertEqual(llu.token, 0xe13b0f94)
        self.assertEqual(llu.string, '%llu')
        self.assertIsNone(llu.date_removed)

        answer, = db.token_to_entries[0x141c35d5]
        self.assertEqual(answer.string, 'The answer: "%s"')

    def test_collisions(self):
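        # 'o000' and '0Q1Q' hash to the same token under the fixed-length
        # 65599 hash, so the database must keep both entries under a single
        # token.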
        hash_1 = tokens.pw_tokenizer_65599_fixed_length_hash('o000', 96)
        hash_2 = tokens.pw_tokenizer_65599_fixed_length_hash('0Q1Q', 96)
        self.assertEqual(hash_1, hash_2)

        db = tokens.Database.from_strings(['o000', '0Q1Q'])

        self.assertEqual(len(db.token_to_entries[hash_1]), 2)
        self.assertCountEqual(
            [entry.string for entry in db.token_to_entries[hash_1]],
            ['o000', '0Q1Q'])

    def test_purge(self):
        db = read_db_from_csv(CSV_DATABASE)
        original_length = len(db.token_to_entries)

        self.assertEqual(db.token_to_entries[0][0].string, '')
        self.assertEqual(db.token_to_entries[0x31631781][0].string, '%d')
        self.assertEqual(db.token_to_entries[0x2e668cd6][0].string,
                         'Jello, world!')
        self.assertEqual(db.token_to_entries[0xb3653e13][0].string, 'Jello!')
        self.assertEqual(db.token_to_entries[0xcc6d3131][0].string, 'Jello?')
        self.assertEqual(db.token_to_entries[0xe65aefef][0].string,
                         "Won't fit : %s%d")
        db.purge(datetime.datetime(2019, 6, 11))
        self.assertLess(len(db.token_to_entries), original_length)

        self.assertFalse(db.token_to_entries[0])
        self.assertEqual(db.token_to_entries[0x31631781][0].string, '%d')
        self.assertFalse(db.token_to_entries[0x2e668cd6])
        self.assertEqual(db.token_to_entries[0xb3653e13][0].string, 'Jello!')
        self.assertEqual(db.token_to_entries[0xcc6d3131][0].string, 'Jello?')
        self.assertFalse(db.token_to_entries[0xe65aefef])

    def test_merge(self):
        """Tests the tokens.Database merge method."""
        db = tokens.Database()

        # Test basic merging into an empty database.
        db.merge(
            tokens.Database([
                tokens.TokenizedStringEntry(1, 'one', datetime.datetime.min),
                tokens.TokenizedStringEntry(2, 'two', datetime.datetime.min),
            ]))
        self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})
        self.assertEqual(db.token_to_entries[1][0].date_removed,
                         datetime.datetime.min)
        self.assertEqual(db.token_to_entries[2][0].date_removed,
                         datetime.datetime.min)

        # Test merging in an entry with a removal date.
        db.merge(
            tokens.Database([
                tokens.TokenizedStringEntry(3, 'three'),
                tokens.TokenizedStringEntry(4, 'four', datetime.datetime.min),
            ]))
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four'})
        self.assertIsNone(db.token_to_entries[3][0].date_removed)
        self.assertEqual(db.token_to_entries[4][0].date_removed,
                         datetime.datetime.min)

        # Test merging in one entry.
        db.merge(tokens.Database([
            tokens.TokenizedStringEntry(5, 'five'),
        ]))
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
        self.assertEqual(db.token_to_entries[4][0].date_removed,
                         datetime.datetime.min)
        self.assertIsNone(db.token_to_entries[5][0].date_removed)

        # Merge in repeated entries with different removal dates.
        db.merge(
            tokens.Database([
                tokens.TokenizedStringEntry(4, 'four', datetime.datetime.max),
                tokens.TokenizedStringEntry(5, 'five', datetime.datetime.max),
            ]))
        self.assertEqual(len(db.entries()), 5)
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
        self.assertEqual(db.token_to_entries[4][0].date_removed,
                         datetime.datetime.max)
        self.assertIsNone(db.token_to_entries[5][0].date_removed)

        # Merge in the same repeated entries now without removal dates.
        db.merge(
            tokens.Database([
                tokens.TokenizedStringEntry(4, 'four'),
                tokens.TokenizedStringEntry(5, 'five')
            ]))
        self.assertEqual(len(db.entries()), 5)
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})
        self.assertIsNone(db.token_to_entries[4][0].date_removed)
        self.assertIsNone(db.token_to_entries[5][0].date_removed)

        # Merge in an empty database.
        db.merge(tokens.Database([]))
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four', 'five'})

    def test_merge_multiple(self):
        db = tokens.Database.merged(
            tokens.Database(
                [tokens.TokenizedStringEntry(1, 'one',
                                             datetime.datetime.max)]),
            tokens.Database(
                [tokens.TokenizedStringEntry(2, 'two',
                                             datetime.datetime.min)]),
            tokens.Database(
                [tokens.TokenizedStringEntry(1, 'one',
                                             datetime.datetime.min)]))
        self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})

        db.merge(
            tokens.Database([
                tokens.TokenizedStringEntry(4, 'four', datetime.datetime.max)
            ]),
            tokens.Database(
                [tokens.TokenizedStringEntry(2, 'two',
                                             datetime.datetime.max)]),
            tokens.Database([
                tokens.TokenizedStringEntry(3, 'three', datetime.datetime.min)
            ]))
        self.assertEqual({str(e)
                          for e in db.entries()},
                         {'one', 'two', 'three', 'four'})

    def test_entry_counts(self):
        self.assertEqual(len(CSV_DATABASE.splitlines()), 16)

        db = read_db_from_csv(CSV_DATABASE)
        self.assertEqual(len(db.entries()), 16)
        self.assertEqual(len(db.token_to_entries), 16)

        # Add two strings with the same hash.
        db.add(['o000', '0Q1Q'])
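
        # The colliding strings add two entries but only one new token, so
        # entries() grows by two while token_to_entries grows by one.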
        self.assertEqual(len(db.entries()), 18)
        self.assertEqual(len(db.token_to_entries), 17)

    def test_mark_removals(self):
        db = tokens.Database.from_strings(
            ['MILK', 'apples', 'oranges', 'CHEESE', 'pears'])
        self.assertTrue(
            all(entry.date_removed is None for entry in db.entries()))

        date_1 = datetime.datetime(1, 2, 3)
        db.mark_removals(['apples', 'oranges', 'pears'], date_1)

        self.assertEqual(
            db.token_to_entries[db.tokenize('MILK')][0].date_removed, date_1)
        self.assertEqual(
            db.token_to_entries[db.tokenize('CHEESE')][0].date_removed,
            date_1)

        now = datetime.datetime.now()
        db.mark_removals(['MILK', 'CHEESE', 'pears'])

        # New strings are not added or re-added in mark_removals().
        self.assertGreaterEqual(
            db.token_to_entries[db.tokenize('MILK')][0].date_removed, date_1)
        self.assertGreaterEqual(
            db.token_to_entries[db.tokenize('CHEESE')][0].date_removed,
            date_1)

        # These strings were removed.
        self.assertGreaterEqual(
            db.token_to_entries[db.tokenize('apples')][0].date_removed, now)
        self.assertGreaterEqual(
            db.token_to_entries[db.tokenize('oranges')][0].date_removed, now)
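
        # 'pears' was in the surviving list both times, so it was never
        # marked as removed.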
        self.assertIsNone(
            db.token_to_entries[db.tokenize('pears')][0].date_removed)

    def test_add(self):
        db = tokens.Database()
        db.add(['MILK', 'apples'])
        self.assertEqual({e.string for e in db.entries()}, {'MILK', 'apples'})

        db.add(['oranges', 'CHEESE', 'pears'])
        self.assertEqual(len(db.entries()), 5)

        db.add(['MILK', 'apples', 'only this one is new'])
        self.assertEqual(len(db.entries()), 6)

        db.add(['MILK'])
        self.assertEqual({e.string
                          for e in db.entries()}, {
                              'MILK', 'apples', 'oranges', 'CHEESE', 'pears',
                              'only this one is new'
                          })
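
    # Together, the next two tests check that the binary and CSV formats
    # round-trip: writing CSV_DATABASE as binary must produce exactly
    # BINARY_DATABASE, and parsing BINARY_DATABASE must reproduce the CSV.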
    def test_binary_format_write(self):
        db = read_db_from_csv(CSV_DATABASE)

        with io.BytesIO() as fd:
            tokens.write_binary(db, fd)
            binary_db = fd.getvalue()

        self.assertEqual(BINARY_DATABASE, binary_db)

    def test_binary_format_parse(self):
        with io.BytesIO(BINARY_DATABASE) as binary_db:
            db = tokens.Database(tokens.parse_binary(binary_db))

        self.assertEqual(str(db), CSV_DATABASE)


class TestFilter(unittest.TestCase):
    """Tests the filtering functionality."""
    def setUp(self):
        super().setUp()
        self.db = tokens.Database([
            tokens.TokenizedStringEntry(1, 'Luke'),
            tokens.TokenizedStringEntry(2, 'Leia'),
            tokens.TokenizedStringEntry(2, 'Darth Vader'),
            tokens.TokenizedStringEntry(2, 'Emperor Palpatine'),
            tokens.TokenizedStringEntry(3, 'Han'),
            tokens.TokenizedStringEntry(4, 'Chewbacca'),
            tokens.TokenizedStringEntry(5, 'Darth Maul'),
            tokens.TokenizedStringEntry(6, 'Han Solo'),
        ])
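
    # These tests exercise filter(): include=... keeps only strings matching
    # at least one of the regexes, exclude=... drops strings matching any of
    # them, and calling filter() with neither keeps every entry.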
    def test_filter_include_single_regex(self):
        self.db.filter(include=[' '])  # anything with a space
        self.assertEqual(
            set(e.string for e in self.db.entries()),
            {'Darth Vader', 'Emperor Palpatine', 'Darth Maul', 'Han Solo'})

    def test_filter_include_multiple_regexes(self):
        self.db.filter(include=['Darth', 'cc', '^Han$'])
        self.assertEqual(set(e.string for e in self.db.entries()),
                         {'Darth Vader', 'Darth Maul', 'Han', 'Chewbacca'})

    def test_filter_include_no_matches(self):
        self.db.filter(include=['Gandalf'])
        self.assertFalse(self.db.entries())

    def test_filter_exclude_single_regex(self):
        self.db.filter(exclude=['^[^L]'])
        self.assertEqual(set(e.string for e in self.db.entries()),
                         {'Luke', 'Leia'})

    def test_filter_exclude_multiple_regexes(self):
        self.db.filter(exclude=[' ', 'Han', 'Chewbacca'])
        self.assertEqual(set(e.string for e in self.db.entries()),
                         {'Luke', 'Leia'})

    def test_filter_exclude_no_matches(self):
        self.db.filter(exclude=['.*'])
        self.assertFalse(self.db.entries())

    def test_filter_include_and_exclude(self):
        self.db.filter(include=[' '], exclude=['Darth', 'Emperor'])
        self.assertEqual(set(e.string for e in self.db.entries()),
                         {'Han Solo'})

    def test_filter_neither_include_nor_exclude(self):
        self.db.filter()
        self.assertEqual(
            set(e.string for e in self.db.entries()), {
                'Luke', 'Leia', 'Darth Vader', 'Emperor Palpatine', 'Han',
                'Chewbacca', 'Darth Maul', 'Han Solo'
            })


if __name__ == '__main__':
    unittest.main()