Initial open-source commit of Emboss.
diff --git a/front_end/tokenizer.py b/front_end/tokenizer.py
new file mode 100644
index 0000000..a9d005a
--- /dev/null
+++ b/front_end/tokenizer.py
@@ -0,0 +1,223 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization for the Emboss definition language.
+
+This module exports the tokenize function and various errors.
+
+In addition, a couple of lists are exported for the use of
+generate_grammar_md.py:
+
+LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
+ input.
+REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
+ REGEX_TOKEN_PATTERNS[n].regex is an re.RegexObject
+ (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and
+ REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens
+ which match the pattern.
+"""
+
+import collections
+import re
+
+from util import error
+from util import parser_types
+
+
+def tokenize(text, file_name):
+ # TODO(bolms): suppress end-of-line, indent, and dedent tokens between matched
+ # delimiters ([], (), and {}).
+ """Tokenizes its argument.
+
+ Arguments:
+ text: The raw text of a .emb file.
+ file_name: The name of the file to use in errors.
+
+ Returns:
+ A tuple of:
+      A list of parser_types.Tokens, or None if there were errors.
+      A possibly-empty list of errors.
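+
+  Example (illustrative; only token symbols are shown here):
+
+    tokenize("struct Foo:", "example.emb") returns a token list whose
+    symbols are '"struct"', 'CamelWord', '":"', and '"\\n"' (the
+    end-of-line token), along with an empty error list.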
+ """
+ tokens = []
+ indent_stack = [""]
+ line_number = 0
+ for line in text.splitlines():
+ line_number += 1
+
+ # _tokenize_line splits the actual text into tokens.
+ line_tokens, errors = _tokenize_line(line, line_number, file_name)
+ if errors:
+ return None, errors
+
+    # Lines with only whitespace and comments are not used for Indent/Dedent
+    # calculation: they produce an end-of-line token, but no Indent or Dedent
+    # tokens.
+ for token in line_tokens:
+ if token.symbol != "Comment":
+ break
+ else:
+ tokens.extend(line_tokens)
+ tokens.append(parser_types.Token(
+ '"\\n"', "\n", parser_types.make_location(
+ (line_number, len(line) + 1), (line_number, len(line) + 1))))
+ continue
+
+ # Leading whitespace is whatever .lstrip() removes.
+ leading_whitespace = line[0:len(line) - len(line.lstrip())]
+ if leading_whitespace == indent_stack[-1]:
+ # If the current leading whitespace is equal to the last leading
+ # whitespace, do not emit an Indent or Dedent token.
+ pass
+ elif leading_whitespace.startswith(indent_stack[-1]):
+ # If the current leading whitespace is longer than the last leading
+ # whitespace, emit an Indent token. For the token text, take the new
+ # part of the whitespace.
+ tokens.append(
+ parser_types.Token(
+ "Indent", leading_whitespace[len(indent_stack[-1]):],
+ parser_types.make_location(
+ (line_number, len(indent_stack[-1]) + 1),
+ (line_number, len(leading_whitespace) + 1))))
+ indent_stack.append(leading_whitespace)
+ else:
+ # Otherwise, search for the unclosed indentation level that matches
+ # the current indentation level. Emit a Dedent token for each
+ # newly-closed indentation level.
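+      # For example (illustrative): if indent_stack is ["", "  ", "    "] and
+      # the new leading whitespace is "  ", one Dedent token is emitted and
+      # "    " is popped from the stack.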
+ for i in range(len(indent_stack) - 1, -1, -1):
+ if leading_whitespace == indent_stack[i]:
+ break
+ tokens.append(
+ parser_types.Token("Dedent", "", parser_types.make_location(
+ (line_number, len(leading_whitespace) + 1),
+ (line_number, len(leading_whitespace) + 1))))
+ del indent_stack[i]
+ else:
+ return None, [[error.error(
+ file_name, parser_types.make_location(
+ (line_number, 1), (line_number, len(leading_whitespace) + 1)),
+ "Bad indentation")]]
+
+ tokens.extend(line_tokens)
+
+ # Append an end-of-line token (for non-whitespace lines).
+ tokens.append(parser_types.Token(
+ '"\\n"', "\n", parser_types.make_location(
+ (line_number, len(line) + 1), (line_number, len(line) + 1))))
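+  # At end of input, emit a Dedent token for each indentation level that is
+  # still open.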
+ for i in range(len(indent_stack) - 1):
+ tokens.append(parser_types.Token("Dedent", "", parser_types.make_location(
+ (line_number + 1, 1), (line_number + 1, 1))))
+ return tokens, []
+
+# Token patterns used by _tokenize_line.
+LITERAL_TOKEN_PATTERNS = (
+ "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
+ "$static_size_in_bits $is_statically_sized "
+ "$max $present $upper_bound $lower_bound "
+ "$size_in_bits $size_in_bytes "
+ "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
+ "$default struct bits enum external import as if let").split()
+_T = collections.namedtuple("T", ["regex", "symbol"])
+REGEX_TOKEN_PATTERNS = [
+ # Words starting with variations of "emboss reserved" are reserved for
+ # internal use by the Emboss compiler.
+ _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
+ _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
+ _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
+ _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
+ _T(re.compile("[0-9]+"), "Number"),
+ _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
+ _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
+ _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
+ _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
+ _T(re.compile("0b[01]+"), "Number"),
+ _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
+ _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
+ _T(re.compile("true|false"), "BooleanConstant"),
+ _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
+ # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
+ # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords. A
+ # ShoutyWord must start with an upper case letter and contain at least one
+ # more upper case letter or '_'.
+ _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
+ # A CamelWord starts with A-Z and contains at least one a-z, and no _.
+ _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
+ _T(re.compile("-- .*"), "Documentation"),
+ _T(re.compile("--$"), "Documentation"),
+ _T(re.compile("--.*"), "BadDocumentation"),
+ _T(re.compile(r"\s+"), None),
+ _T(re.compile("#.*"), "Comment"),
+    # BadWord and BadNumber are catch-alls for words and numbers so that
+    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
+    #
+    # This is preferable to returning an error because the BadWord and BadNumber
+    # token types can be used in example-based errors (see the example after
+    # this pattern list).
+ _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
+ _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
+]
+del _T
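+# For example (illustrative): "0x1234_5678" matches an underscore-grouped hex
+# Number pattern above and tokenizes as a Number, while "0x12_345" (whose digit
+# groups have the wrong sizes) is longest-matched by the BadNumber catch-all
+# and tokenizes as a BadNumber.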
+
+
+def _tokenize_line(line, line_number, file_name):
+ """Tokenizes a single line of input.
+
+ Arguments:
+ line: The line of text to tokenize.
+ line_number: The line number (used when constructing token objects).
+ file_name: The name of a file to use in errors.
+
+ Returns:
+ A tuple of:
+ A list of token objects or None.
+ A possibly-empty list of errors.
+ """
+ tokens = []
+ offset = 0
+ while offset < len(line):
+ best_candidate = ""
+ best_candidate_symbol = None
+ # Find the longest match. Ties go to the first match. This way, keywords
+ # ("struct") are matched as themselves, but words that only happen to start
+ # with keywords ("structure") are matched as words.
+ #
+ # There is never a reason to try to match a literal after a regex that
+ # could also match that literal, so check literals first.
+ for literal in LITERAL_TOKEN_PATTERNS:
+ if line[offset:].startswith(literal) and len(literal) > len(
+ best_candidate):
+ best_candidate = literal
+ # For Emboss, the name of a literal token is just the literal in quotes,
+ # so that the grammar can read a little more naturally, e.g.:
+ #
+ # expression -> expression "+" expression
+ #
+ # instead of
+ #
+ # expression -> expression Plus expression
+ best_candidate_symbol = '"' + literal + '"'
+ for pattern in REGEX_TOKEN_PATTERNS:
+ match_result = pattern.regex.match(line[offset:])
+ if match_result and len(match_result.group(0)) > len(best_candidate):
+ best_candidate = match_result.group(0)
+ best_candidate_symbol = pattern.symbol
+ if not best_candidate:
+ return None, [[error.error(
+ file_name, parser_types.make_location(
+ (line_number, offset + 1), (line_number, offset + 2)),
+ "Unrecognized token")]]
+ if best_candidate_symbol:
+ tokens.append(parser_types.Token(
+ best_candidate_symbol, best_candidate, parser_types.make_location(
+ (line_number, offset + 1),
+ (line_number, offset + len(best_candidate) + 1))))
+ offset += len(best_candidate)
+  return tokens, []
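+
+
+# A minimal usage sketch (illustrative; the error handling shown is an
+# assumption about how callers might use this module, not part of the
+# compiler itself):
+#
+#   tokens, errors = tokenize("enum Color:\n  RED = 1\n", "example.emb")
+#   if errors:
+#     ...  # each entry in errors is a list of error messages from util.error
+#   else:
+#     for token in tokens:
+#       print(token.symbol)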