| # Copyright 2019 Google LLC |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Tokenization for the Emboss definition language. |
| |
| This module exports the tokenize function and various errors. |
| |
| In addition, a couple of lists are exported for the use of |
| generate_grammar_md.py: |
| |
| LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against |
| input. |
| REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
| REGEX_TOKEN_PATTERNS[n].regex is a compiled regular expression object
| (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and
| REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens
| which match the pattern.
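|
| For example, a documentation generator might walk the patterns like this
| (an illustrative sketch, not the actual code in generate_grammar_md.py):
|
| for pattern in REGEX_TOKEN_PATTERNS:
| print(pattern.symbol, pattern.regex.pattern)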
| """ |
| |
| import collections |
| import re |
| |
| from compiler.util import error |
| from compiler.util import parser_types |
| |
| |
| def tokenize(text, file_name): |
| # TODO(bolms): suppress end-of-line, indent, and dedent tokens between matched |
| # delimiters ([], (), and {}). |
| """Tokenizes its argument. |
| |
| Arguments: |
| text: The raw text of a .emb file. |
| file_name: The name of the file to use in errors. |
| |
| Returns:
| A tuple of:
| a list of parser_types.Tokens, or None if there were errors
| a possibly-empty list of errors.
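|
| Example (illustrative; the exact Token fields are defined in parser_types):
|
| tokens, errors = tokenize("struct Foo:\n", "example.emb")
| # errors == [] and tokens contains tokens with the symbols
| # '"struct"', 'CamelWord', '":"', and '"\\n"', in that order.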
| """ |
| tokens = [] |
| indent_stack = [""] |
| line_number = 0 |
| for line in text.splitlines(): |
| line_number += 1 |
| |
| # _tokenize_line splits the actual text into tokens. |
| line_tokens, errors = _tokenize_line(line, line_number, file_name) |
| if errors: |
| return None, errors |
| |
| # Lines containing only whitespace and comments do not affect Indent/Dedent
| # calculation, but they still produce end-of-line tokens.
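| # For example, a line containing just "# note" yields a Comment token
| # followed by a '"\\n"' token, but no Indent or Dedent.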
| for token in line_tokens: |
| if token.symbol != "Comment": |
| break |
| else: |
| tokens.extend(line_tokens) |
| tokens.append( |
| parser_types.Token( |
| '"\\n"', |
| "\n", |
| parser_types.make_location( |
| (line_number, len(line) + 1), (line_number, len(line) + 1) |
| ), |
| ) |
| ) |
| continue |
| |
| # Leading whitespace is whatever .lstrip() removes. |
| leading_whitespace = line[0 : len(line) - len(line.lstrip())] |
| if leading_whitespace == indent_stack[-1]: |
| # If the current leading whitespace is equal to the last leading |
| # whitespace, do not emit an Indent or Dedent token. |
| pass |
| elif leading_whitespace.startswith(indent_stack[-1]): |
| # If the current leading whitespace is longer than the last leading |
| # whitespace, emit an Indent token. For the token text, take the new |
| # part of the whitespace. |
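| # For example, if indent_stack[-1] is "  " and the new line starts with
| # "    ", the Indent token's text is the newly added "  ".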
| tokens.append( |
| parser_types.Token( |
| "Indent", |
| leading_whitespace[len(indent_stack[-1]) :], |
| parser_types.make_location( |
| (line_number, len(indent_stack[-1]) + 1), |
| (line_number, len(leading_whitespace) + 1), |
| ), |
| ) |
| ) |
| indent_stack.append(leading_whitespace) |
| else: |
| # Otherwise, search for the unclosed indentation level that matches |
| # the current indentation level. Emit a Dedent token for each |
| # newly-closed indentation level. |
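| # For example, with indent_stack == ["", "  ", "    "] and a line
| # indented by "  ", one Dedent is emitted and the stack shrinks to
| # ["", "  "].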
| for i in range(len(indent_stack) - 1, -1, -1): |
| if leading_whitespace == indent_stack[i]: |
| break |
| tokens.append( |
| parser_types.Token( |
| "Dedent", |
| "", |
| parser_types.make_location( |
| (line_number, len(leading_whitespace) + 1), |
| (line_number, len(leading_whitespace) + 1), |
| ), |
| ) |
| ) |
| del indent_stack[i] |
| else: |
| return None, [ |
| [ |
| error.error( |
| file_name, |
| parser_types.make_location( |
| (line_number, 1), |
| (line_number, len(leading_whitespace) + 1), |
| ), |
| "Bad indentation", |
| ) |
| ] |
| ] |
| |
| tokens.extend(line_tokens) |
| |
| # Append an end-of-line token (for non-whitespace lines). |
| tokens.append( |
| parser_types.Token( |
| '"\\n"', |
| "\n", |
| parser_types.make_location( |
| (line_number, len(line) + 1), (line_number, len(line) + 1) |
| ), |
| ) |
| ) |
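| # At end of input, close any indentation levels that are still open
| # (everything on the stack except the base "" level).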
| for _ in range(len(indent_stack) - 1):
| tokens.append( |
| parser_types.Token( |
| "Dedent", |
| "", |
| parser_types.make_location((line_number + 1, 1), (line_number + 1, 1)), |
| ) |
| ) |
| return tokens, [] |
| |
| |
| # Token patterns used by _tokenize_line. |
| LITERAL_TOKEN_PATTERNS = ( |
| "[ ] ( ) : = + - * . ? == != && || < > <= >= , " |
| "$static_size_in_bits $is_statically_sized " |
| "$max $present $upper_bound $lower_bound $next " |
| "$size_in_bits $size_in_bytes " |
| "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes " |
| "$default struct bits enum external import as if let" |
| ).split() |
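| # Each literal doubles as its own token symbol, wrapped in double quotes:
| # the literal "struct" produces tokens with the symbol '"struct"' (see
| # _tokenize_line below).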
| _T = collections.namedtuple("T", ["regex", "symbol"]) |
| REGEX_TOKEN_PATTERNS = [ |
| # Words starting with variations of "emboss reserved" are reserved for |
| # internal use by the Emboss compiler. |
| _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"), |
| _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"), |
| _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"), |
| _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"), |
| _T(re.compile("[0-9]+"), "Number"), |
| _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"), |
| _T(re.compile("0x[0-9a-fA-F]+"), "Number"), |
| _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"), |
| _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"), |
| _T(re.compile("0b[01]+"), "Number"), |
| _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"), |
| _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"), |
| _T(re.compile("true|false"), "BooleanConstant"), |
| _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"), |
| # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number |
| # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords. A |
| # ShoutyWord must start with an upper case letter and contain at least one |
| # more upper case letter or '_'. |
| _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"), |
| # A CamelWord starts with A-Z and contains at least one a-z, and no _. |
| _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"), |
| _T(re.compile("-- .*"), "Documentation"), |
| _T(re.compile("--$"), "Documentation"), |
| _T(re.compile("--.*"), "BadDocumentation"), |
| _T(re.compile(r"\s+"), None), |
| _T(re.compile("#.*"), "Comment"), |
| # BadWord and BadNumber are catch-alls for words and numbers, so that
| # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
| # |
| # This is preferable to returning an error because the BadWord and BadNumber |
| # token types can be used in example-based errors. |
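| # For example, "123abc" matches BadNumber in its entirety, which beats
| # the shorter Number match "123" under the longest-match rule.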
| _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"), |
| _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"), |
| ] |
| del _T |
| |
| |
| def _tokenize_line(line, line_number, file_name): |
| """Tokenizes a single line of input. |
| |
| Arguments: |
| line: The line of text to tokenize. |
| line_number: The line number (used when constructing token objects). |
| file_name: The name of a file to use in errors. |
| |
| Returns: |
| A tuple of: |
| A list of token objects, or None if there were errors.
| A possibly-empty list of errors. |
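|
| Example (illustrative):
|
| tokens, errors = _tokenize_line("x = 10", 1, "example.emb")
| # errors == [] and tokens has the symbols 'SnakeWord', '"="', and
| # 'Number'; the whitespace between them produces no tokens.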
| """ |
| tokens = [] |
| offset = 0 |
| while offset < len(line): |
| best_candidate = "" |
| best_candidate_symbol = None |
| # Find the longest match. Ties go to the first match. This way, keywords |
| # ("struct") are matched as themselves, but words that only happen to start |
| # with keywords ("structure") are matched as words. |
| # |
| # There is never a reason to try to match a literal after a regex that |
| # could also match that literal, so check literals first. |
| for literal in LITERAL_TOKEN_PATTERNS: |
| if line[offset:].startswith(literal) and len(literal) > len(best_candidate): |
| best_candidate = literal |
| # For Emboss, the name of a literal token is just the literal in quotes, |
| # so that the grammar can read a little more naturally, e.g.: |
| # |
| # expression -> expression "+" expression |
| # |
| # instead of |
| # |
| # expression -> expression Plus expression |
| best_candidate_symbol = '"' + literal + '"' |
| for pattern in REGEX_TOKEN_PATTERNS: |
| match_result = pattern.regex.match(line[offset:]) |
| if match_result and len(match_result.group(0)) > len(best_candidate): |
| best_candidate = match_result.group(0) |
| best_candidate_symbol = pattern.symbol |
| if not best_candidate: |
| return None, [ |
| [ |
| error.error( |
| file_name, |
| parser_types.make_location( |
| (line_number, offset + 1), (line_number, offset + 2) |
| ), |
| "Unrecognized token", |
| ) |
| ] |
| ] |
| if best_candidate_symbol: |
| tokens.append( |
| parser_types.Token( |
| best_candidate_symbol, |
| best_candidate, |
| parser_types.make_location( |
| (line_number, offset + 1), |
| (line_number, offset + len(best_candidate) + 1), |
| ), |
| ) |
| ) |
| offset += len(best_candidate) |
| return tokens, []