# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generates a Markdown file documenting the raw Emboss grammar."""

from __future__ import print_function

import re
import sys

from compiler.front_end import constraints
from compiler.front_end import module_ir
from compiler.front_end import tokenizer

# Keep the output to less than 80 columns, so that the preformatted sections are
# not cut off.
_MAX_OUTPUT_WIDTH = 80

_HEADER = """
This is the context-free grammar for Emboss. Terminal symbols are in `"quotes"`
or are named in `CamelCase`; nonterminal symbols are named in `snake_case`. The
term `<empty>` to the right of the `->` indicates an empty production (a rule
where the left-hand-side may be parsed from an empty string).
This listing is auto-generated from the grammar defined in `module_ir.py`.
Note that, unlike in many languages, comments are included in the grammar. This
is so that comments can be handled more easily by the autoformatter; comments
are ignored by the compiler. This is distinct from *documentation*, which is
included in the IR for use by documentation generators.
""".lstrip()

_BOILERPLATE_PRODUCTION_HEADER = """
The following productions are automatically generated to handle zero-or-more,
one-or-more, and zero-or-one repeated lists (`foo*`, `foo+`, and `foo?`
nonterminals) in LR(1). They are included for completeness, but may be ignored
if you just want to understand the grammar.
"""

_TOKENIZER_RULE_HEADER = """
The following regexes are used to tokenize input into the corresponding symbols.
Note that the `Indent`, `Dedent`, and `EndOfLine` symbols are generated using
separate logic.
"""

_KEYWORDS_HEADER = """
The following {} keywords are reserved, but not used, by Emboss. They may not
be used as field, type, or enum value names.
"""


def _sort_productions(productions, start_symbol):
    """Sorts the given productions in a human-friendly order."""
    productions_by_lhs = {}
    for p in productions:
        if p.lhs not in productions_by_lhs:
            productions_by_lhs[p.lhs] = set()
        productions_by_lhs[p.lhs].add(p)
    queue = [start_symbol]
    previously_queued_symbols = set(queue)
    main_production_list = []
    # This sorts productions depth-first. I'm not sure if it is better to sort
    # them breadth-first or depth-first, or with some hybrid.
    while queue:
        symbol = queue.pop(-1)
        if symbol not in productions_by_lhs:
            continue
        for production in sorted(productions_by_lhs[symbol]):
            main_production_list.append(production)
            for symbol in production.rhs:
                # Skip boilerplate productions for now, but include their base
                # production.
                if symbol and symbol[-1] in "*+?":
                    symbol = symbol[0:-1]
                if symbol not in previously_queued_symbols:
                    queue.append(symbol)
                    previously_queued_symbols.add(symbol)
    # It's not particularly important to put boilerplate productions in any
    # particular order.
    boilerplate_production_list = sorted(set(productions) - set(main_production_list))
    for production in boilerplate_production_list:
        assert production.lhs[-1] in "*+?", "Found orphaned production {}".format(
            production.lhs
        )
    assert set(productions) == set(main_production_list + boilerplate_production_list)
    assert len(productions) == len(main_production_list) + len(
        boilerplate_production_list
    )
    return main_production_list, boilerplate_production_list


def _word_wrap_at_column(words, width):
    """Wraps words to the specified width, and returns a list of wrapped lines."""
    result = []
    in_progress = []
    for word in words:
        if len(" ".join(in_progress + [word])) > width:
            result.append(" ".join(in_progress))
            assert len(result[-1]) <= width
            in_progress = []
        in_progress.append(word)
    result.append(" ".join(in_progress))
    assert len(result[-1]) <= width
    return result


def _format_productions(productions):
    """Formats a list of productions for inclusion in a Markdown document."""
    max_lhs_len = max([len(production.lhs) for production in productions])
    # TODO(bolms): This highlighting is close for now, but not actually right.
    result = ["```shell\n"]
    last_lhs = None
    for production in productions:
        if last_lhs == production.lhs:
            lhs = ""
            delimiter = " |"
        else:
            lhs = production.lhs
            delimiter = "->"
        leader = "{lhs:{width}} {delimiter}".format(
            lhs=lhs, width=max_lhs_len, delimiter=delimiter
        )
        for rhs_block in _word_wrap_at_column(
            production.rhs or ["<empty>"], _MAX_OUTPUT_WIDTH - len(leader)
        ):
            result.append("{leader} {rhs}\n".format(leader=leader, rhs=rhs_block))
            leader = " " * len(leader)
        last_lhs = production.lhs
    result.append("```\n")
    return "".join(result)


def _normalize_literal_patterns(literals):
    """Normalizes a list of strings to a list of (regex, symbol) pairs."""
    return [
        (re.sub(r"(\W)", r"\\\1", literal), '"' + literal + '"') for literal in literals
    ]


def _normalize_regex_patterns(regexes):
    """Normalizes a list of tokenizer regexes to a list of (regex, symbol)."""
    # g3doc breaks up patterns containing '|' when they are inserted into a
    # table, unless they're preceded by '\'. Note that other special
    # characters, including '\', should *not* be escaped with '\'.
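    # For example, a pattern like r"a|b" would be rendered as r"a\|b" in the
    # generated table.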
    return [(re.sub(r"\|", r"\\|", r.regex.pattern), r.symbol) for r in regexes]


def _normalize_reserved_word_list(reserved_words):
    """Returns words that would be allowed as names if they were not reserved."""
    interesting_reserved_words = []
    for word in reserved_words:
        tokens, errors = tokenizer.tokenize(word, "")
        assert tokens and not errors, "Failed to tokenize " + word
        if tokens[0].symbol in ["SnakeWord", "CamelWord", "ShoutyWord"]:
            interesting_reserved_words.append(word)
    return sorted(interesting_reserved_words)


def _format_token_rules(token_rules):
    """Formats a list of (pattern, symbol) pairs as a table."""
    pattern_width = max([len(rule[0]) for rule in token_rules])
    pattern_width += 2  # For the `` characters.
    result = [
        "{pat_header:{width}} | Symbol\n"
        "{empty:-<{width}} | {empty:-<30}\n".format(
            pat_header="Pattern", width=pattern_width, empty=""
        )
    ]
    for rule in token_rules:
        if rule[1]:
            symbol_name = "`" + rule[1] + "`"
        else:
            symbol_name = "*no symbol emitted*"
        result.append(
            "{pattern:{width}} | {symbol}\n".format(
                pattern="`" + rule[0] + "`", width=pattern_width, symbol=symbol_name
            )
        )
    return "".join(result)


def _format_keyword_list(reserved_words):
    """Formats a list of reserved words."""
    lines = []
    current_line = ""
    for word in reserved_words:
        if len(current_line) + len(word) + 2 > 80:
            lines.append(current_line)
            current_line = ""
        current_line += "`{}` ".format(word)
    # Flush the final, partially-filled line so the last words are not dropped.
    if current_line:
        lines.append(current_line)
    return "".join([line[:-1] + "\n" for line in lines])


def generate_grammar_md():
    """Generates up-to-date text for grammar.md."""
    main_productions, boilerplate_productions = _sort_productions(
        module_ir.PRODUCTIONS, module_ir.START_SYMBOL
    )
    result = [
        _HEADER,
        _format_productions(main_productions),
        _BOILERPLATE_PRODUCTION_HEADER,
        _format_productions(boilerplate_productions),
    ]
    main_tokens = _normalize_literal_patterns(tokenizer.LITERAL_TOKEN_PATTERNS)
    main_tokens += _normalize_regex_patterns(tokenizer.REGEX_TOKEN_PATTERNS)
    result.append(_TOKENIZER_RULE_HEADER)
    result.append(_format_token_rules(main_tokens))
    reserved_words = _normalize_reserved_word_list(constraints.get_reserved_word_list())
    result.append(_KEYWORDS_HEADER.format(len(reserved_words)))
    result.append(_format_keyword_list(reserved_words))
    return "".join(result)


def main(argv):
    del argv  # Unused.
    print(generate_grammar_md(), end="")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))