#!/usr/bin/python3
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
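
"""Generates Python source for cached versions of the Emboss parsers.

The output is intended to be written to
compiler/front_end/generated/cached_parser.py (see the command in _HEADER),
so that the LR(1) parser tables do not have to be rebuilt from the grammar
on every run of the compiler.
"""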

import collections
import keyword
import sys

from compiler.front_end import lr1
from compiler.front_end import make_parser
from compiler.util import parser_types


def _identifier(i):
    """Turns a number into a Python identifier.

    Does not account for reserved words: your code will need to do so.
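
    For example: _identifier(0) == "a", _identifier(25) == "z", and
    _identifier(52) == "ab".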

    Args:
        i: the number to translate

    Returns:
        A Python identifier word.
    """
    idleads = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    idchars = idleads + "0123456789_"
    header = idleads[i % len(idleads)]
    i //= len(idleads)
    while i:
        header += idchars[i % len(idchars)]
        i //= len(idchars)
    return header


def as_py_source(parser, function_name):
    """Turns a Parser into Python source code.

    Returns a string that is a definition of a Python function that will
    recreate the given Parser, minus data that is only used for debugging.

    This function does some work to make the source smaller, but specifically
    does not do anything that would require real processing when the function
    runs, such as decompressing a text stream.

    Args:
        parser: the Parser to serialize
        function_name: the name of the function that should recreate `parser`

    Returns:
        Source to a Python function that recreates the given Parser.
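
    For illustration, the generated source has roughly this shape (the
    identifier names here are made up; real ones are assigned by frequency
    of use, with the shortest names going to the most-used objects):

        def module_parser():
            P = parser_types.Production
            S = lambda x: lr1.Shift(x, ())
            R = lr1.Reduce
            A = lr1.Accept
            E = lr1.Error
            a='sym'
            b=P(a,(a,))
            prods = {
                b,
            }
            goto = {
                0:{a:1},
            }
            act = {
                0:{a:S(1)},
                1:{a:R(b)},
            }
            defe = {
                1:'default error',
            }
            return lr1.Parser(...)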
"""
    header = [
        f"def {function_name}():\n"  #
        "    P = parser_types.Production\n"
        "    S = lambda x: lr1.Shift(x, ())\n"
        "    R = lr1.Reduce\n"
        "    A = lr1.Accept\n"
        "    E = lr1.Error\n"
    ]
    body = []
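
    # Placeholder for an object's not-yet-assigned identifier in the
    # generated source: the postprocessing step at the end of this function
    # swaps each S instance for a final, frequency-ranked identifier.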
    S = collections.namedtuple("S", "temp_ident")

    def declaration(ident, s):
        """Returns a declaration of s with symbol-based compression."""
        if isinstance(s, parser_types.Production):
            r = ["    ", ident, "=P(", sym(s.lhs), ","]
            if len(s.rhs) == 0:
                r += ["()"]
            elif len(s.rhs) == 1:
                r += ["(", sym(s.rhs[0]), ",)"]
            else:
                r += ["("]
                for rhss in s.rhs:
                    r += [sym(rhss), ","]
                del r[-1]
                r += [")"]
            r += [")\n"]
        else:
            r = ["    ", ident, "=", repr(s), "\n"]
        return r

    # Bookkeeping for sym()
    counter = 0  # Next available ID.
    symbols = {}  # Map of object => S() placeholder
    symbol_uses = {}  # Map of object => count of uses
    symbol_defs = []  # Declaration fragments (may contain placeholders)

    def sym(s):
        """Returns a placeholder for s."""
        nonlocal counter
        if s not in symbols:
            ident = S(counter)
            counter += 1
            symbols[s] = ident
            symbol_uses[s] = 0
            symbol_defs.extend(declaration(ident, s))
        symbol_uses[s] += 1
        return symbols[s]

    # Serialize parser.productions. This is only used to sanity check the
    # parser when it gets loaded: if the cached parser's production list does
    # not match the production list from module_ir at load time, the cached
    # parser will be ignored and a new parser will be generated at runtime.
    #
    # This is convenient when making changes to the Emboss grammar.
    body.append("    prods = {\n")
    for production in parser.productions:
        body += ["        ", sym(production), ",\n"]
    body.append("    }\n")

    # Serialize the GOTO table, one state per line.
    body.append("    goto = {\n")
    for key_state in sorted(parser.goto.keys()):
        body.append(f"        {key_state}:" "{")
        for key_symbol, goto_state in sorted(parser.goto[key_state].items()):
            body += [sym(key_symbol), f":{goto_state}", ","]
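        # Remove the trailing comma emitted after the last entry.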
        del body[-1]
        body.append("},\n")
    body.append("    }\n")

    # Serialize the ACTION table, one state per line.
    body.append("    act = {\n")
    for key_state in sorted(parser.action.keys()):
        body.append(f"        {key_state}:" "{")
        for key_symbol, value in sorted(parser.action[key_state].items()):
            body += [sym(key_symbol), ":"]
            if isinstance(value, lr1.Shift):
                # The `items` are not used for actual parsing, so they are
                # discarded here.
                body.append(f"S({value.state})")
            elif isinstance(value, lr1.Reduce):
                body += ["R(", sym(value.rule), ")"]
            elif isinstance(value, lr1.Accept):
                body.append("A()")
            elif isinstance(value, lr1.Error):
                body += ["E(", sym(value.code), ")"]
            body.append(",")
        del body[-1]
        body.append("},\n")
    body.append("    }\n")

    # Serialize the default errors map.
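    # (`defe` becomes the `default_errors` argument to lr1.Parser: a map from
    # parser state to the default error value for that state.)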
body.append(" defe = {\n")
for key, value in sorted(parser.default_errors.items()):
body += [f" {key}:", sym(value), ",\n"]
body.append(" }\n")

    # Finally, Parser construction.
    body.append(
        "    return lr1.Parser(\n"  #
        "        item_sets=None,\n"
        "        goto=goto,\n"
        "        action=act,\n"
        f"        conflicts={repr(parser.conflicts)},\n"
        "        terminals=None,\n"
        "        nonterminals=None,\n"
        "        productions=prods,\n"
        "        default_errors=defe,\n"
        "    )"
    )

    # Postprocess the output to swap the symbol placeholders for proper
    # symbols.
    #
    # Symbol assignment is done late (here) so that one-character symbols can
    # be used for the most frequently referenced objects.
    #
    # It would be possible to inline objects that are only used once, but
    # there do not seem to be very many of those, and it would be necessary
    # to track symbol definitions individually, instead of just concatenating
    # them all together in symbol_defs.
    symbol_idents = {}  # Map of placeholder => final identifier
    ident_counter = 0  # Counter used for generating identifiers
    reserved_identifiers = "P S R A E prods act goto defe".split()

    # Iterate through the symbols from most common to least common.
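    # The sort key is (use count, type name, object): str(type(k)) prevents
    # direct comparisons between keys of different types, and the object
    # itself acts as a tie-breaker so the output is deterministic.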
    for _, _, symbol in sorted(
        ((v, str(type(k)), k) for k, v in symbol_uses.items()), reverse=True
    ):
        # Find the next available identifier, skipping identifiers that are
        # used by the skeleton and identifiers that are actually Python
        # keywords.
        while True:
            ident = _identifier(ident_counter)
            ident_counter += 1
            if ident not in reserved_identifiers and not keyword.iskeyword(ident):
                break
        # Assign the final symbol to the placeholder.
        symbol_idents[symbols[symbol]] = ident

    # Iterate through the text fragments looking for any placeholders (S's),
    # and replace the placeholders with their final symbol.
    for l in (symbol_defs, body):
        for i in range(len(l)):
            if isinstance(l[i], S):
                l[i] = symbol_idents[l[i]]

    # Return the final result.
    return "".join(header + symbol_defs + body)


_HEADER = """
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# GENERATED CODE. DO NOT MANUALLY EDIT. TO UPDATE, RUN:
#
# bazel run //compiler/front_end:generate_cached_parser > compiler/front_end/generated/cached_parser.py
from compiler.front_end import lr1
from compiler.util import parser_types
""".strip()


def generate_parser_file_text():
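    """Returns the full source text of the cached-parser module."""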
    module_parser = make_parser.build_module_parser()
    expression_parser = make_parser.build_expression_parser()
    return "".join(
        [
            _HEADER,
            "\n",
            as_py_source(module_parser, "module_parser"),
            "\n",
            as_py_source(expression_parser, "expression_parser"),
            "\n",
        ]
    )


def main(argv):
    print(generate_parser_file_text(), end="")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))