# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 15 | """Tokenization for the Emboss definition language. |
| 16 | |
| 17 | This module exports the tokenize function and various errors. |
| 18 | |
| 19 | In addition, a couple of lists are exported for the use of |
| 20 | generate_grammar_md.py: |
| 21 | |
| 22 | LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against |
| 23 | input. |
| 24 | REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization. |
| 25 | REGEX_TOKEN_PATTERNS[n].regex is an re.RegexObject |
| 26 | (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and |
| 27 | REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens |
| 28 | which match the pattern. |
| 29 | """ |

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
    # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
    # matched delimiters ([], (), and {}).
    """Tokenizes its argument.

    Arguments:
      text: The raw text of a .emb file.
      file_name: The name of the file, for use in error messages.

    Returns:
      A tuple of:
        a list of parser_types.Token objects, or None if there were errors
        a possibly-empty list of errors
    """
    tokens = []
    indent_stack = [""]
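    # indent_stack holds the leading whitespace of each currently-open
    # indentation level; the outermost level is always "".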
    line_number = 0
    for line in text.splitlines():
        line_number += 1

        # _tokenize_line splits the actual text into tokens.
        line_tokens, errors = _tokenize_line(line, line_number, file_name)
        if errors:
            return None, errors

Dmitri Prime | 495d3f2 | 2024-09-06 16:56:59 -0700 | [diff] [blame] | 63 | # Lines with only whitespace and comments are not used for Indent/Dedent |
| 64 | # calculation, and do not produce end-of-line tokens. |
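        # (The `else` clause below belongs to the `for` loop: it runs only if
        # no non-Comment token is found, i.e. if `break` never fires.)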
        for token in line_tokens:
            if token.symbol != "Comment":
                break
        else:
            tokens.extend(line_tokens)
            tokens.append(
                parser_types.Token(
                    '"\\n"',
                    "\n",
                    parser_types.make_location(
                        (line_number, len(line) + 1), (line_number, len(line) + 1)
                    ),
                )
            )
            continue

Dmitri Prime | 495d3f2 | 2024-09-06 16:56:59 -0700 | [diff] [blame] | 81 | # Leading whitespace is whatever .lstrip() removes. |
| 82 | leading_whitespace = line[0 : len(line) - len(line.lstrip())] |
| 83 | if leading_whitespace == indent_stack[-1]: |
| 84 | # If the current leading whitespace is equal to the last leading |
| 85 | # whitespace, do not emit an Indent or Dedent token. |
| 86 | pass |
| 87 | elif leading_whitespace.startswith(indent_stack[-1]): |
| 88 | # If the current leading whitespace is longer than the last leading |
| 89 | # whitespace, emit an Indent token. For the token text, take the new |
| 90 | # part of the whitespace. |
| 91 | tokens.append( |
| 92 | parser_types.Token( |
| 93 | "Indent", |
| 94 | leading_whitespace[len(indent_stack[-1]) :], |
| 95 | parser_types.make_location( |
| 96 | (line_number, len(indent_stack[-1]) + 1), |
| 97 | (line_number, len(leading_whitespace) + 1), |
| 98 | ), |
| 99 | ) |
| 100 | ) |
| 101 | indent_stack.append(leading_whitespace) |
| 102 | else: |
| 103 | # Otherwise, search for the unclosed indentation level that matches |
| 104 | # the current indentation level. Emit a Dedent token for each |
| 105 | # newly-closed indentation level. |
| 106 | for i in range(len(indent_stack) - 1, -1, -1): |
| 107 | if leading_whitespace == indent_stack[i]: |
| 108 | break |
| 109 | tokens.append( |
| 110 | parser_types.Token( |
| 111 | "Dedent", |
| 112 | "", |
| 113 | parser_types.make_location( |
| 114 | (line_number, len(leading_whitespace) + 1), |
| 115 | (line_number, len(leading_whitespace) + 1), |
| 116 | ), |
| 117 | ) |
| 118 | ) |
| 119 | del indent_stack[i] |
| 120 | else: |
| 121 | return None, [ |
| 122 | [ |
| 123 | error.error( |
| 124 | file_name, |
| 125 | parser_types.make_location( |
| 126 | (line_number, 1), |
| 127 | (line_number, len(leading_whitespace) + 1), |
| 128 | ), |
| 129 | "Bad indentation", |
| 130 | ) |
| 131 | ] |
| 132 | ] |
| 133 | |
| 134 | tokens.extend(line_tokens) |
| 135 | |
        # Append an end-of-line token (for non-whitespace lines).
        tokens.append(
            parser_types.Token(
                '"\\n"',
                "\n",
                parser_types.make_location(
                    (line_number, len(line) + 1), (line_number, len(line) + 1)
                ),
            )
        )
    for i in range(len(indent_stack) - 1):
        tokens.append(
            parser_types.Token(
                "Dedent",
                "",
                parser_types.make_location((line_number + 1, 1), (line_number + 1, 1)),
            )
        )
    return tokens, []
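

# As a sketch of the Indent/Dedent handling above:
#
#     tokenize("struct Foo:\n  [a]\n", "example.emb")
#
# yields tokens whose symbols are, in order: '"struct"', CamelWord, '":"',
# '"\\n"', Indent, '"["', SnakeWord, '"]"', '"\\n"', and a final Dedent for
# the indentation level still open at end-of-input.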


# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let"
).split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords.  A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no '_'.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T
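

# A few illustrative matches against the patterns above (a sketch; the
# longest-match rule in _tokenize_line decides among them, with ties going to
# the earlier pattern):
#
#     "0x1234_5678"  =>  Number      (the grouped-hex pattern matches in full)
#     "0x1234_567"   =>  BadNumber   (underscore groups must be 4 or 8 digits,
#                                     so only BadNumber matches all 10 chars)
#     "abcDef"       =>  BadWord     (SnakeWord stops at "abc")
#     "CONSTANT_1"   =>  ShoutyWord  (BadWord also matches in full, but
#                                     ShoutyWord comes first)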


def _tokenize_line(line, line_number, file_name):
    """Tokenizes a single line of input.

    Arguments:
      line: The line of text to tokenize.
      line_number: The line number (used when constructing token objects).
      file_name: The name of the file, for use in error messages.

    Returns:
      A tuple of:
        a list of token objects, or None if there were errors
        a possibly-empty list of errors
    """
    tokens = []
    offset = 0
    while offset < len(line):
        best_candidate = ""
        best_candidate_symbol = None
        # Find the longest match.  Ties go to the first match.  This way,
        # keywords ("struct") are matched as themselves, but words that only
        # happen to start with keywords ("structure") are matched as words.
        #
        # There is never a reason to try to match a literal after a regex that
        # could also match that literal, so check literals first.
        for literal in LITERAL_TOKEN_PATTERNS:
            if line[offset:].startswith(literal) and len(literal) > len(best_candidate):
                best_candidate = literal
                # For Emboss, the name of a literal token is just the literal
                # in quotes, so that the grammar can read a little more
                # naturally, e.g.:
                #
                #     expression -> expression "+" expression
                #
                # instead of
                #
                #     expression -> expression Plus expression
                best_candidate_symbol = '"' + literal + '"'
        for pattern in REGEX_TOKEN_PATTERNS:
            match_result = pattern.regex.match(line[offset:])
            if match_result and len(match_result.group(0)) > len(best_candidate):
                best_candidate = match_result.group(0)
                best_candidate_symbol = pattern.symbol
        if not best_candidate:
            return None, [
                [
                    error.error(
                        file_name,
                        parser_types.make_location(
                            (line_number, offset + 1), (line_number, offset + 2)
                        ),
                        "Unrecognized token",
                    )
                ]
            ]
        if best_candidate_symbol:
            tokens.append(
                parser_types.Token(
                    best_candidate_symbol,
                    best_candidate,
                    parser_types.make_location(
                        (line_number, offset + 1),
                        (line_number, offset + len(best_candidate) + 1),
                    ),
                )
            )
        offset += len(best_candidate)
    return tokens, []
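

# As a sketch of the longest-match rule above:
#
#     _tokenize_line("structure", 1, "example.emb")
#
# produces a single SnakeWord token: the "struct" literal matches only six
# characters, while the SnakeWord regex matches all nine, and the longer
# match wins.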