# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for tokenizer."""

import unittest
from compiler.front_end import tokenizer
from compiler.util import error
from compiler.util import parser_types


def _token_symbols(token_list):
    """Given a list of tokens, returns a list of their symbol names."""
    return [token.symbol for token in token_list]


class TokenizerTest(unittest.TestCase):
    """Tests for the tokenizer.tokenize function."""

    def test_bad_indent_tab_versus_space(self):
        # A bad indent is one that doesn't match a previous unmatched indent.
        tokens, errors = tokenizer.tokenize(" a\n\tb", "file")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "file",
                        parser_types.make_location((2, 1), (2, 2)),
                        "Bad indentation",
                    )
                ]
            ],
            errors,
        )

    def test_bad_indent_tab_versus_eight_spaces(self):
        tokens, errors = tokenizer.tokenize("        a\n\tb", "file")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "file",
                        parser_types.make_location((2, 1), (2, 2)),
                        "Bad indentation",
                    )
                ]
            ],
            errors,
        )

    def test_bad_indent_tab_versus_four_spaces(self):
        tokens, errors = tokenizer.tokenize("    a\n\tb", "file")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "file",
                        parser_types.make_location((2, 1), (2, 2)),
                        "Bad indentation",
                    )
                ]
            ],
            errors,
        )

    def test_bad_indent_two_spaces_versus_one_space(self):
        tokens, errors = tokenizer.tokenize("  a\n b", "file")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "file",
                        parser_types.make_location((2, 1), (2, 2)),
                        "Bad indentation",
                    )
                ]
            ],
            errors,
        )

    def test_bad_indent_matches_closed_indent(self):
        tokens, errors = tokenizer.tokenize(" a\nb\n  c\n d", "file")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "file",
                        parser_types.make_location((4, 1), (4, 2)),
                        "Bad indentation",
                    )
                ]
            ],
            errors,
        )

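    # In the next test, r'"\\""' is a complete string literal ('"\\"')
    # followed by a stray '"' at column 5, which cannot start a valid token.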
    def test_bad_string_after_string_with_escaped_backslash_at_end(self):
        tokens, errors = tokenizer.tokenize(r'"\\""', "name")
        self.assertFalse(tokens)
        self.assertEqual(
            [
                [
                    error.error(
                        "name",
                        parser_types.make_location((1, 5), (1, 6)),
                        "Unrecognized token",
                    )
                ]
            ],
            errors,
        )


def _make_short_token_match_tests():
    """Makes tests for short, simple tokenization cases."""
    eol = '"\\n"'
    cases = {
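        # Word classification, as exercised below: CamelCase identifiers
        # are CamelWords, lowercase_with_underscores are SnakeWords,
        # ALL_CAPS are ShoutyWords, and mixtures or leading-underscore
        # names are BadWords.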
        "Cam": ["CamelWord", eol],
        "Ca9": ["CamelWord", eol],
        "CanB": ["CamelWord", eol],
        "CanBee": ["CamelWord", eol],
        "CBa": ["CamelWord", eol],
        "cam": ["SnakeWord", eol],
        "ca9": ["SnakeWord", eol],
        "can_b": ["SnakeWord", eol],
        "can_bee": ["SnakeWord", eol],
        "c_ba": ["SnakeWord", eol],
        "cba_": ["SnakeWord", eol],
        "c_b_a_": ["SnakeWord", eol],
        "CAM": ["ShoutyWord", eol],
        "CA9": ["ShoutyWord", eol],
        "CAN_B": ["ShoutyWord", eol],
        "CAN_BEE": ["ShoutyWord", eol],
        "C_BA": ["ShoutyWord", eol],
        "C": ["BadWord", eol],
        "C1": ["BadWord", eol],
        "c": ["SnakeWord", eol],
        "$": ["BadWord", eol],
        "_": ["BadWord", eol],
        "_a": ["BadWord", eol],
        "_A": ["BadWord", eol],
        "Cb_A": ["BadWord", eol],
        "aCb": ["BadWord", eol],
        "a b": ["SnakeWord", "SnakeWord", eol],
        "a\tb": ["SnakeWord", "SnakeWord", eol],
        "a \t b ": ["SnakeWord", "SnakeWord", eol],
        " \t ": [eol],
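        # "#" begins a comment that runs to the end of the line.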
        "a #b": ["SnakeWord", "Comment", eol],
        "a#": ["SnakeWord", "Comment", eol],
        "# b": ["Comment", eol],
        " # b": ["Comment", eol],
        " #": ["Comment", eol],
        "": [],
        "\n": [eol],
        "\na": [eol, "SnakeWord", eol],
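        # "--" followed by whitespace or end-of-line begins a Documentation
        # token; longer dash runs, or "--" glued to the following text, are
        # BadDocumentation.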
        "a--example": ["SnakeWord", "BadDocumentation", eol],
        "a ---- example": ["SnakeWord", "BadDocumentation", eol],
        "a --- example": ["SnakeWord", "BadDocumentation", eol],
        "a-- example": ["SnakeWord", "Documentation", eol],
        "a -- -- example": ["SnakeWord", "Documentation", eol],
        "a -- - example": ["SnakeWord", "Documentation", eol],
        "--": ["Documentation", eol],
        "-- ": ["Documentation", eol],
        "--  ": ["Documentation", eol],
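        # Keywords and "$"-prefixed builtins must match exactly; near-misses
        # tokenize as SnakeWord or BadWord instead.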
        "$default": ['"$default"', eol],
        "$defaultx": ["BadWord", eol],
        "$def": ["BadWord", eol],
        "x$default": ["BadWord", eol],
        "9$default": ["BadWord", eol],
        "struct": ['"struct"', eol],
        "external": ['"external"', eol],
        "bits": ['"bits"', eol],
        "enum": ['"enum"', eol],
        "as": ['"as"', eol],
        "import": ['"import"', eol],
        "true": ["BooleanConstant", eol],
        "false": ["BooleanConstant", eol],
        "truex": ["SnakeWord", eol],
        "falsex": ["SnakeWord", eol],
        "structx": ["SnakeWord", eol],
        "bitsx": ["SnakeWord", eol],
        "enumx": ["SnakeWord", eol],
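        # The cases below pin down the "_" separator rules for binary and
        # hex literals: groups after the first must be uniformly four or
        # uniformly eight digits, and the leading group may not be longer
        # than the others.  Prefixes must be lowercase ("0b", "0x").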
        "0b": ["BadNumber", eol],
        "0x": ["BadNumber", eol],
        "0b011101": ["Number", eol],
        "0b0": ["Number", eol],
        "0b0111_1111_0000": ["Number", eol],
        "0b00_000_00": ["BadNumber", eol],
        "0b0_0_0": ["BadNumber", eol],
        "0b0111012": ["BadNumber", eol],
        "0b011101x": ["BadWord", eol],
        "0b011101b": ["BadNumber", eol],
        "0B0": ["BadNumber", eol],
        "0X0": ["BadNumber", eol],
        "0b_": ["BadNumber", eol],
        "0x_": ["BadNumber", eol],
        "0b__": ["BadNumber", eol],
        "0x__": ["BadNumber", eol],
        "0b_0000": ["Number", eol],
        "0b0000_": ["BadNumber", eol],
        "0b00_____00": ["BadNumber", eol],
        "0x00_000_00": ["BadNumber", eol],
        "0x0_0_0": ["BadNumber", eol],
        "0b____0____": ["BadNumber", eol],
        "0b00000000000000000000": ["Number", eol],
        "0b_00000000": ["Number", eol],
        "0b0000_0000_0000": ["Number", eol],
        "0b000_0000_0000": ["Number", eol],
        "0b00_0000_0000": ["Number", eol],
        "0b0_0000_0000": ["Number", eol],
        "0b_0000_0000_0000": ["Number", eol],
        "0b_000_0000_0000": ["Number", eol],
        "0b_00_0000_0000": ["Number", eol],
        "0b_0_0000_0000": ["Number", eol],
        "0b00000000_00000000_00000000": ["Number", eol],
        "0b0000000_00000000_00000000": ["Number", eol],
        "0b000000_00000000_00000000": ["Number", eol],
        "0b00000_00000000_00000000": ["Number", eol],
        "0b0000_00000000_00000000": ["Number", eol],
        "0b000_00000000_00000000": ["Number", eol],
        "0b00_00000000_00000000": ["Number", eol],
        "0b0_00000000_00000000": ["Number", eol],
        "0b_00000000_00000000_00000000": ["Number", eol],
        "0b_0000000_00000000_00000000": ["Number", eol],
        "0b_000000_00000000_00000000": ["Number", eol],
        "0b_00000_00000000_00000000": ["Number", eol],
        "0b_0000_00000000_00000000": ["Number", eol],
        "0b_000_00000000_00000000": ["Number", eol],
        "0b_00_00000000_00000000": ["Number", eol],
        "0b_0_00000000_00000000": ["Number", eol],
        "0x0": ["Number", eol],
        "0x00000000000000000000": ["Number", eol],
        "0x_0000": ["Number", eol],
        "0x_00000000": ["Number", eol],
        "0x0000_0000_0000": ["Number", eol],
        "0x000_0000_0000": ["Number", eol],
        "0x00_0000_0000": ["Number", eol],
        "0x0_0000_0000": ["Number", eol],
        "0x_0000_0000_0000": ["Number", eol],
        "0x_000_0000_0000": ["Number", eol],
        "0x_00_0000_0000": ["Number", eol],
        "0x_0_0000_0000": ["Number", eol],
        "0x00000000_00000000_00000000": ["Number", eol],
        "0x0000000_00000000_00000000": ["Number", eol],
        "0x000000_00000000_00000000": ["Number", eol],
        "0x00000_00000000_00000000": ["Number", eol],
        "0x0000_00000000_00000000": ["Number", eol],
        "0x000_00000000_00000000": ["Number", eol],
        "0x00_00000000_00000000": ["Number", eol],
        "0x0_00000000_00000000": ["Number", eol],
        "0x_00000000_00000000_00000000": ["Number", eol],
        "0x_0000000_00000000_00000000": ["Number", eol],
        "0x_000000_00000000_00000000": ["Number", eol],
        "0x_00000_00000000_00000000": ["Number", eol],
        "0x_0000_00000000_00000000": ["Number", eol],
        "0x_000_00000000_00000000": ["Number", eol],
        "0x_00_00000000_00000000": ["Number", eol],
        "0x_0_00000000_00000000": ["Number", eol],
        "0x__00000000_00000000": ["BadNumber", eol],
        "0x00000000_00000000_0000": ["BadNumber", eol],
        "0x00000000_0000_0000": ["BadNumber", eol],
        "0x_00000000000000000000": ["BadNumber", eol],
        "0b_00000000000000000000": ["BadNumber", eol],
        "0b00000000_00000000_0000": ["BadNumber", eol],
        "0b00000000_0000_0000": ["BadNumber", eol],
        "0x0000_": ["BadNumber", eol],
        "0x00_____00": ["BadNumber", eol],
        "0x____0____": ["BadNumber", eol],
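        # Identifiers starting with any casing of "emboss_reserved" are
        # reserved for the compiler and tokenize as BadWord.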
        "EmbossReserved": ["BadWord", eol],
        "EmbossReservedA": ["BadWord", eol],
        "EmbossReserved_": ["BadWord", eol],
        "EMBOSS_RESERVED": ["BadWord", eol],
        "EMBOSS_RESERVED_": ["BadWord", eol],
        "EMBOSS_RESERVEDA": ["BadWord", eol],
        "emboss_reserved": ["BadWord", eol],
        "emboss_reserved_": ["BadWord", eol],
        "emboss_reserveda": ["BadWord", eol],
        "0x0123456789abcdefABCDEF": ["Number", eol],
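        # Per the cases below, decimal literals may use "_" separators only
        # in groups of exactly three digits.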
        "0": ["Number", eol],
        "1": ["Number", eol],
        "1a": ["BadNumber", eol],
        "1g": ["BadWord", eol],
        "1234567890": ["Number", eol],
        "1_234_567_890": ["Number", eol],
        "234_567_890": ["Number", eol],
        "34_567_890": ["Number", eol],
        "4_567_890": ["Number", eol],
        "1_2_3_4_5_6_7_8_9_0": ["BadNumber", eol],
        "1234567890_": ["BadNumber", eol],
        "1__234567890": ["BadNumber", eol],
        "_1234567890": ["BadWord", eol],
        "[]": ['"["', '"]"', eol],
        "()": ['"("', '")"', eol],
        "..": ['"."', '"."', eol],
        "...": ['"."', '"."', '"."', eol],
        "....": ['"."', '"."', '"."', '"."', eol],
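        # In these cases, only \\, \", and \n are accepted escapes, and an
        # escaped backslash leaves the characters after it unescaped.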
        '"abc"': ["String", eol],
        '""': ["String", eol],
        r'"\\"': ["String", eol],
        r'"\""': ["String", eol],
        r'"\n"': ["String", eol],
        r'"\\n"': ["String", eol],
        r'"\\xyz"': ["String", eol],
        r'"\\\\"': ["String", eol],
    }
    for c in (
        "[ ] ( ) ? : = + - * . == != < <= > >= && || , $max $present "
        "$upper_bound $lower_bound $size_in_bits $size_in_bytes "
        "$max_size_in_bits $max_size_in_bytes $min_size_in_bits "
        "$min_size_in_bytes "
        "$default struct bits enum external import as if let"
    ).split():
        cases[c] = ['"' + c + '"', eol]

    def make_test_case(case):

        def test_case(self):
            tokens, errors = tokenizer.tokenize(case, "name")
            symbols = _token_symbols(tokens)
            self.assertFalse(errors)
            self.assertEqual(symbols, cases[case])

        return test_case

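    # Registering each case as its own test method makes failures report
    # the specific input rather than aborting the whole sweep.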
    for c in cases:
        setattr(TokenizerTest, "testShortTokenMatch{!r}".format(c), make_test_case(c))


def _make_bad_char_tests():
    """Makes tests that an error is returned for bad characters."""

    def make_test_case(case):

        def test_case(self):
            tokens, errors = tokenizer.tokenize(case, "name")
            self.assertFalse(tokens)
            self.assertEqual(
                [
                    [
                        error.error(
                            "name",
                            parser_types.make_location((1, 1), (1, 2)),
                            "Unrecognized token",
                        )
                    ]
                ],
                errors,
            )

        return test_case

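    # Each of these characters, on its own, does not form a valid token.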
    for c in "~`!@%^&\\|;'\"/{}":
        setattr(TokenizerTest, "testBadChar{!r}".format(c), make_test_case(c))


def _make_bad_string_tests():
    """Makes tests that an error is returned for bad strings."""
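    # Unterminated strings, raw newlines inside a string, and unknown
    # escape sequences must all be rejected.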
    bad_strings = (r'"\"', '"\\\n"', r'"\\\"', r'"', r'"\q"', r'"\\\q"')

    def make_test_case(string):

        def test_case(self):
            tokens, errors = tokenizer.tokenize(string, "name")
            self.assertFalse(tokens)
            self.assertEqual(
                [
                    [
                        error.error(
                            "name",
                            parser_types.make_location((1, 1), (1, 2)),
                            "Unrecognized token",
                        )
                    ]
                ],
                errors,
            )

        return test_case

    for s in bad_strings:
        setattr(TokenizerTest, "testBadString{!r}".format(s), make_test_case(s))


def _make_multiline_tests():
    """Makes tests for indent/dedent insertion and eol insertion."""

    c = "Comment"
    eol = '"\\n"'
    sw = "SnakeWord"
    ind = "Indent"
    ded = "Dedent"
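    # As in Python, changes in indentation produce synthetic Indent/Dedent
    # tokens; blank and comment-only lines do not affect indentation.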
    cases = {
        "a\nb\n": [sw, eol, sw, eol],
        "a\n\nb\n": [sw, eol, eol, sw, eol],
        "a\n#foo\nb\n": [sw, eol, c, eol, sw, eol],
        "a\n #foo\nb\n": [sw, eol, c, eol, sw, eol],
        "a\n b\n": [sw, eol, ind, sw, eol, ded],
        "a\n b\n\n": [sw, eol, ind, sw, eol, eol, ded],
        "a\n b\n  c\n": [sw, eol, ind, sw, eol, ind, sw, eol, ded, ded],
        "a\n b\n c\n": [sw, eol, ind, sw, eol, sw, eol, ded],
        "a\n b\n\n c\n": [sw, eol, ind, sw, eol, eol, sw, eol, ded],
        "a\n b\n #\n c\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
        "a\n\tb\n #\n\tc\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
        " a\n  b\n   c\n d\n": [
            ind,
            sw,
            eol,
            ind,
            sw,
            eol,
            ind,
            sw,
            eol,
            ded,
            ded,
            sw,
            eol,
            ded,
        ],
    }

    def make_test_case(case):

        def test_case(self):
            tokens, errors = tokenizer.tokenize(case, "file")
            self.assertFalse(errors)
            self.assertEqual(_token_symbols(tokens), cases[case])

        return test_case

    for c in cases:
        setattr(TokenizerTest, "testMultiline{!r}".format(c), make_test_case(c))


def _make_offset_tests():
    """Makes tests that the tokenizer fills in correct source locations."""
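    # Expected locations are "line:column-line:column", 1-based, with the
    # end column exclusive; synthetic end-of-line and Dedent tokens get
    # zero-width spans.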
    cases = {
        "a+": ["1:1-1:2", "1:2-1:3", "1:3-1:3"],
        "a   +   ": ["1:1-1:2", "1:5-1:6", "1:9-1:9"],
        "a\n\nb": ["1:1-1:2", "1:2-1:2", "2:1-2:1", "3:1-3:2", "3:2-3:2"],
        "a\n  b": ["1:1-1:2", "1:2-1:2", "2:1-2:3", "2:3-2:4", "2:4-2:4", "3:1-3:1"],
        "a\n  b\nc": [
            "1:1-1:2",
            "1:2-1:2",
            "2:1-2:3",
            "2:3-2:4",
            "2:4-2:4",
            "3:1-3:1",
            "3:1-3:2",
            "3:2-3:2",
        ],
        "a\n b\n  c": [
            "1:1-1:2",
            "1:2-1:2",
            "2:1-2:2",
            "2:2-2:3",
            "2:3-2:3",
            "3:2-3:3",
            "3:3-3:4",
            "3:4-3:4",
            "4:1-4:1",
            "4:1-4:1",
        ],
    }

    def make_test_case(case):

        def test_case(self):
            self.assertEqual(
                [
                    parser_types.format_location(l.source_location)
                    for l in tokenizer.tokenize(case, "file")[0]
                ],
                cases[case],
            )

        return test_case

    for c in cases:
        setattr(TokenizerTest, "testOffset{!r}".format(c), make_test_case(c))


_make_short_token_match_tests()
_make_bad_char_tests()
_make_bad_string_tests()
_make_multiline_tests()
_make_offset_tests()

if __name__ == "__main__":
    unittest.main()