# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization for the Emboss definition language.

This module exports the tokenize function and various errors.

In addition, a couple of lists are exported for the use of
generate_grammar_md.py:

LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
  input.
REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
  REGEX_TOKEN_PATTERNS[n].regex is an re.RegexObject
  (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and
  REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens
  which match the pattern.
"""
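
# A minimal usage sketch of tokenize() (the .emb snippet and file name below
# are purely illustrative, not part of this module's API):
#
#   tokens, errors = tokenize("enum Foo:\n  BAR = 1\n", "example.emb")
#   if errors:
#       ...  # Each entry of `errors` is itself a list of error.error() results.
#   else:
#       ...  # `tokens` is a flat list of parser_types.Token, including
#            # synthetic end-of-line, Indent, and Dedent tokens.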

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
    # TODO(bolms): suppress end-of-line, indent, and dedent tokens between matched
    # delimiters ([], (), and {}).
    """Tokenizes its argument.

    Arguments:
      text: The raw text of a .emb file.
      file_name: The name of the file to use in errors.

    Returns:
      A tuple of:
        a list of parser_types.Tokens or None
        a possibly-empty list of errors.
    """
    tokens = []
    indent_stack = [""]
    line_number = 0
    for line in text.splitlines():
        line_number += 1

        # _tokenize_line splits the actual text into tokens.
        line_tokens, errors = _tokenize_line(line, line_number, file_name)
        if errors:
            return None, errors

        # Lines with only whitespace and comments are not used for Indent/Dedent
        # calculation, and do not produce end-of-line tokens.
        for token in line_tokens:
            if token.symbol != "Comment":
                break
        else:
            tokens.extend(line_tokens)
            tokens.append(
                parser_types.Token(
                    '"\\n"',
                    "\n",
                    parser_types.make_location(
                        (line_number, len(line) + 1), (line_number, len(line) + 1)
                    ),
                )
            )
            continue

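        # An illustrative walk-through of the indent handling below: for lines
        # whose leading whitespace is "", "  ", "    ", and then "  " again,
        # indent_stack grows [""] -> ["", "  "] -> ["", "  ", "    "] and then
        # shrinks back to ["", "  "], emitting Indent, Indent, and one Dedent.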
        # Leading whitespace is whatever .lstrip() removes.
        leading_whitespace = line[0 : len(line) - len(line.lstrip())]
        if leading_whitespace == indent_stack[-1]:
            # If the current leading whitespace is equal to the last leading
            # whitespace, do not emit an Indent or Dedent token.
            pass
        elif leading_whitespace.startswith(indent_stack[-1]):
            # If the current leading whitespace is longer than the last leading
            # whitespace, emit an Indent token. For the token text, take the new
            # part of the whitespace.
            tokens.append(
                parser_types.Token(
                    "Indent",
                    leading_whitespace[len(indent_stack[-1]) :],
                    parser_types.make_location(
                        (line_number, len(indent_stack[-1]) + 1),
                        (line_number, len(leading_whitespace) + 1),
                    ),
                )
            )
            indent_stack.append(leading_whitespace)
        else:
            # Otherwise, search for the unclosed indentation level that matches
            # the current indentation level. Emit a Dedent token for each
            # newly-closed indentation level.
            for i in range(len(indent_stack) - 1, -1, -1):
                if leading_whitespace == indent_stack[i]:
                    break
                tokens.append(
                    parser_types.Token(
                        "Dedent",
                        "",
                        parser_types.make_location(
                            (line_number, len(leading_whitespace) + 1),
                            (line_number, len(leading_whitespace) + 1),
                        ),
                    )
                )
                del indent_stack[i]
            else:
                return None, [
                    [
                        error.error(
                            file_name,
                            parser_types.make_location(
                                (line_number, 1),
                                (line_number, len(leading_whitespace) + 1),
                            ),
                            "Bad indentation",
                        )
                    ]
                ]

        tokens.extend(line_tokens)

        # Append an end-of-line token (for non-whitespace lines).
        tokens.append(
            parser_types.Token(
                '"\\n"',
                "\n",
                parser_types.make_location(
                    (line_number, len(line) + 1), (line_number, len(line) + 1)
                ),
            )
        )
    for i in range(len(indent_stack) - 1):
        tokens.append(
            parser_types.Token(
                "Dedent",
                "",
                parser_types.make_location((line_number + 1, 1), (line_number + 1, 1)),
            )
        )
    return tokens, []


# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let"
).split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords. A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no _.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and BadNumber
    # token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T
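
# A few illustrative classifications under the patterns above (longest match
# wins, and ties go to the earlier pattern): "struct" is the literal token
# '"struct"', while "structure" is a SnakeWord; "FOO_BAR" is a ShoutyWord,
# "FooBar" is a CamelWord, and "abcDef" falls through to BadWord; "12_345" is
# a Number, but "0b2" is a BadNumber.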


def _tokenize_line(line, line_number, file_name):
    """Tokenizes a single line of input.

    Arguments:
      line: The line of text to tokenize.
      line_number: The line number (used when constructing token objects).
      file_name: The name of a file to use in errors.

    Returns:
      A tuple of:
        A list of token objects or None.
        A possibly-empty list of errors.
    """
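    # Illustrative example: for the line 'struct Foo:', the loop below emits
    # the tokens '"struct"' ("struct"), CamelWord ("Foo"), and '":"' (":");
    # the whitespace match has symbol None, so it advances `offset` without
    # emitting a token.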
    tokens = []
    offset = 0
    while offset < len(line):
        best_candidate = ""
        best_candidate_symbol = None
        # Find the longest match. Ties go to the first match. This way, keywords
        # ("struct") are matched as themselves, but words that only happen to start
        # with keywords ("structure") are matched as words.
        #
        # There is never a reason to try to match a literal after a regex that
        # could also match that literal, so check literals first.
        for literal in LITERAL_TOKEN_PATTERNS:
            if line[offset:].startswith(literal) and len(literal) > len(best_candidate):
                best_candidate = literal
                # For Emboss, the name of a literal token is just the literal in quotes,
                # so that the grammar can read a little more naturally, e.g.:
                #
                #     expression -> expression "+" expression
                #
                # instead of
                #
                #     expression -> expression Plus expression
                best_candidate_symbol = '"' + literal + '"'
        for pattern in REGEX_TOKEN_PATTERNS:
            match_result = pattern.regex.match(line[offset:])
            if match_result and len(match_result.group(0)) > len(best_candidate):
                best_candidate = match_result.group(0)
                best_candidate_symbol = pattern.symbol
        if not best_candidate:
            return None, [
                [
                    error.error(
                        file_name,
                        parser_types.make_location(
                            (line_number, offset + 1), (line_number, offset + 2)
                        ),
                        "Unrecognized token",
                    )
                ]
            ]
        if best_candidate_symbol:
            tokens.append(
                parser_types.Token(
                    best_candidate_symbol,
                    best_candidate,
                    parser_types.make_location(
                        (line_number, offset + 1),
                        (line_number, offset + len(best_candidate) + 1),
                    ),
                )
            )
        offset += len(best_candidate)
    return tokens, None