compiler/front_end/generate_cached_parser.py - third_party/github/google/emboss - Git at Google

 #!/usr/bin/python3
 # Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import keyword
 import collections
 import sys

 from compiler.front_end import lr1
 from compiler.front_end import make_parser
 from compiler.util import parser_types


 def _identifier(i):
     """Turns a number into a Python identifier.

     Does not account for reserved words: your code will need to do so.

     Args:
         i: the number to translate

     Returns:
         A Python identifier word.
     """
     idleads = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
     idchars = idleads + "0123456789_"
     header = idleads[i % len(idleads)]
     i //= len(idleads)
     while i:
         header += idchars[i % len(idchars)]
         i //= len(idchars)
     return header


 def as_py_source(parser, function_name):
     """Turns a Parser into Python source code.

     Returns a string that is a definition of a Python function that will
     recreate the given Parser, minus data that is only used for debugging.

     This function does some work to make the source smaller, but specifically
     does not do anything that would require real processing when the function
     runs, such as decompressing a text stream.

     Args:
         parser: the Parser to serialize
         function_name: the name of the function that should recreate `parser`

     Returns:
         Source to a Python function that recreates the given Parser.
     """
     header = [
         f"def {function_name}():\n"  #
         " P = parser_types.Production\n"
         " S = lambda x: lr1.Shift(x, ())\n"
         " R = lr1.Reduce\n"
         " A = lr1.Accept\n"
         " E = lr1.Error\n"
     ]
     body = []

     S = collections.namedtuple("S", "temp_ident")

     def declaration(ident, s):
         """Returns a declaration of s with symbol-based compression."""
         if isinstance(s, parser_types.Production):
             r = [" ", ident, "=P(", sym(s.lhs), ","]
             if len(s.rhs) == 0:
                 r += ["()"]
             elif len(s.rhs) == 1:
                 r += ["(", sym(s.rhs[0]), ",)"]
             else:
                 r += ["("]
                 for rhss in s.rhs:
                     r += [sym(rhss), ","]
                 del r[-1]
                 r += [")"]
             r += [")\n"]
         else:
             r = [" ", ident, "=", repr(s), "\n"]
         return r

     # Bookkkeeping for sym()
     counter = 0  # Next available ID.
     symbols = {}  # Map of object => S() placeholder
     symbol_uses = {}  # Map of object => count of uses
     symbol_defs = []  # Map of object => declaration (may have placeholders)

     def sym(s):
         """Returns a placeholder for s."""
         nonlocal counter
         if s not in symbols:
             ident = S(counter)
             counter += 1
             symbols[s] = ident
             symbol_uses[s] = 0
             symbol_defs.extend(declaration(ident, s))
         symbol_uses[s] += 1
         return symbols[s]

     # Serialize parser.productions.  This is only used to sanity check the
     # parser when it gets loaded: if the cached parser's production list does
     # not match the production list from module_ir at load time, the cached
     # parser will be ignored and a new parser will be generated at runtime.
     #
     # This is convenient when making changes to the Emboss grammar.
     body.append(" prods = {\n")
     for production in parser.productions:
         body += ["  ", sym(production), ",\n"]
     body.append(" }\n")

     # Serialize the GOTO table, one state per line.
     body.append(" goto = {\n")
     for key_state in sorted(parser.goto.keys()):
         body.append(f"  {key_state}:" "{")
         goto_body = []
         for key_symbol, goto_state in sorted(parser.goto[key_state].items()):
             body += [sym(key_symbol), f":{goto_state}", ","]
         del body[-1]
         body.append("},\n")
     body.append(" }\n")

     # Serialize the ACTION table, one state per line.
     body.append(" act = {\n")
     for key_state in sorted(parser.action.keys()):
         body.append(f"  {key_state}:" "{")
         for key_symbol, value in sorted(parser.action[key_state].items()):
             body += [sym(key_symbol), ":"]
             if isinstance(value, lr1.Shift):
                 # The `items` are not used for actual parsing, so they are
                 # discarded here.
                 body.append(f"S({value.state})")
             elif isinstance(value, lr1.Reduce):
                 body += ["R(", sym(value.rule), ")"]
             elif isinstance(value, lr1.Accept):
                 body.append("A()")
             elif isinstance(value, lr1.Error):
                 body += ["E(", sym(value.code), ")"]
             body.append(",")
         del body[-1]
         body.append("},\n")
     body.append(" }\n")

     # Serialize the default errors map.
     body.append(" defe = {\n")
     for key, value in sorted(parser.default_errors.items()):
         body += [f"  {key}:", sym(value), ",\n"]
     body.append(" }\n")

     # Finally, Parser construction.
     body.append(
         " return lr1.Parser(\n"  #
         "  item_sets=None,\n"
         "  goto=goto,\n"
         "  action=act,\n"
         f"  conflicts={repr(parser.conflicts)},\n"
         "  terminals=None,\n"
         "  nonterminals=None,\n"
         "  productions=prods,\n"
         "  default_errors=defe,\n"
         " )"
     )

     # Postprocess the output to swap the symbol placeholders for proper
     # symbols.
     #
     # Symbol assignment is done late (here) so that one-character symbols can
     # be used for the most frequently-reference objects.
     #
     # It would be possible to inline objects that are only used once, but there
     # do not seem to be very many of those, and it would be necessary to track
     # symbol definitions individually, instead of just concatenating them all
     # together in symbol_defs.
     symbol_idents = {}  # Map of placeholder => final identifier
     ident_counter = 0  # Counter used for generating identifiers
     reserved_identifiers = "P S R A E prods act goto defe".split()

     # Iterate through the symbols from most common to least common.
     for _, _, symbol in sorted(
         ((v, str(type(k)), k) for k, v in symbol_uses.items()), reverse=True
     ):
         # Find the next available identifier, skipping identifiers that are
         # used by the skeleton and identifiers that are actually Python
         # keywords.
         while True:
             ident = _identifier(ident_counter)
             ident_counter += 1
             if ident not in reserved_identifiers and not keyword.iskeyword(ident):
                 break
         # Assign the final symbol to the placeholder.
         symbol_idents[symbols[symbol]] = ident

     # Iterate through the text fragments looking for any placeholders (S's),
     # and replace the placeholders with their final symbol.
     for l in (symbol_defs, body):
         for i in range(len(l)):
             if isinstance(l[i], S):
                 l[i] = symbol_idents[l[i]]

     # Return the final result.
     return "".join(header + symbol_defs + body)


 _HEADER = """
 # Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # GENERATED CODE.  DO NOT MANUALLY EDIT.  TO UPDATE, RUN:
 #
 #     bazel run //compiler/front_end:generate_cached_parser > compiler/front_end/generated/cached_parser.py

 from compiler.front_end import lr1
 from compiler.util import parser_types
 """.strip()


 def generate_parser_file_text():
     module_parser = make_parser.build_module_parser()
     expression_parser = make_parser.build_expression_parser()
     return "".join(
         [
             _HEADER,
             "\n",
             as_py_source(module_parser, "module_parser"),
             "\n",
             as_py_source(expression_parser, "expression_parser"),
             "\n",
         ]
     )


 def main(argv):
     print(generate_parser_file_text(), end="")
     return 0


 if __name__ == "__main__":
     sys.exit(main(sys.argv))
	#!/usr/bin/python3
	# Copyright 2024 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import keyword
	import collections
	import sys

	from compiler.front_end import lr1
	from compiler.front_end import make_parser
	from compiler.util import parser_types


	def _identifier(i):
	"""Turns a number into a Python identifier.

	Does not account for reserved words: your code will need to do so.

	Args:
	i: the number to translate

	Returns:
	A Python identifier word.
	"""
	idleads = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
	idchars = idleads + "0123456789_"
	header = idleads[i % len(idleads)]
	i //= len(idleads)
	while i:
	header += idchars[i % len(idchars)]
	i //= len(idchars)
	return header


	def as_py_source(parser, function_name):
	"""Turns a Parser into Python source code.

	Returns a string that is a definition of a Python function that will
	recreate the given Parser, minus data that is only used for debugging.

	This function does some work to make the source smaller, but specifically
	does not do anything that would require real processing when the function
	runs, such as decompressing a text stream.

	Args:
	parser: the Parser to serialize
	function_name: the name of the function that should recreate `parser`

	Returns:
	Source to a Python function that recreates the given Parser.
	"""
	header = [
	f"def {function_name}():\n" #
	" P = parser_types.Production\n"
	" S = lambda x: lr1.Shift(x, ())\n"
	" R = lr1.Reduce\n"
	" A = lr1.Accept\n"
	" E = lr1.Error\n"
	]
	body = []

	S = collections.namedtuple("S", "temp_ident")

	def declaration(ident, s):
	"""Returns a declaration of s with symbol-based compression."""
	if isinstance(s, parser_types.Production):
	r = [" ", ident, "=P(", sym(s.lhs), ","]
	if len(s.rhs) == 0:
	r += ["()"]
	elif len(s.rhs) == 1:
	r += ["(", sym(s.rhs[0]), ",)"]
	else:
	r += ["("]
	for rhss in s.rhs:
	r += [sym(rhss), ","]
	del r[-1]
	r += [")"]
	r += [")\n"]
	else:
	r = [" ", ident, "=", repr(s), "\n"]
	return r

	# Bookkkeeping for sym()
	counter = 0 # Next available ID.
	symbols = {} # Map of object => S() placeholder
	symbol_uses = {} # Map of object => count of uses
	symbol_defs = [] # Map of object => declaration (may have placeholders)

	def sym(s):
	"""Returns a placeholder for s."""
	nonlocal counter
	if s not in symbols:
	ident = S(counter)
	counter += 1
	symbols[s] = ident
	symbol_uses[s] = 0
	symbol_defs.extend(declaration(ident, s))
	symbol_uses[s] += 1
	return symbols[s]

	# Serialize parser.productions. This is only used to sanity check the
	# parser when it gets loaded: if the cached parser's production list does
	# not match the production list from module_ir at load time, the cached
	# parser will be ignored and a new parser will be generated at runtime.
	#
	# This is convenient when making changes to the Emboss grammar.
	body.append(" prods = {\n")
	for production in parser.productions:
	body += [" ", sym(production), ",\n"]
	body.append(" }\n")

	# Serialize the GOTO table, one state per line.
	body.append(" goto = {\n")
	for key_state in sorted(parser.goto.keys()):
	body.append(f" {key_state}:" "{")
	goto_body = []
	for key_symbol, goto_state in sorted(parser.goto[key_state].items()):
	body += [sym(key_symbol), f":{goto_state}", ","]
	del body[-1]
	body.append("},\n")
	body.append(" }\n")

	# Serialize the ACTION table, one state per line.
	body.append(" act = {\n")
	for key_state in sorted(parser.action.keys()):
	body.append(f" {key_state}:" "{")
	for key_symbol, value in sorted(parser.action[key_state].items()):
	body += [sym(key_symbol), ":"]
	if isinstance(value, lr1.Shift):
	# The `items` are not used for actual parsing, so they are
	# discarded here.
	body.append(f"S({value.state})")
	elif isinstance(value, lr1.Reduce):
	body += ["R(", sym(value.rule), ")"]
	elif isinstance(value, lr1.Accept):
	body.append("A()")
	elif isinstance(value, lr1.Error):
	body += ["E(", sym(value.code), ")"]
	body.append(",")
	del body[-1]
	body.append("},\n")
	body.append(" }\n")

	# Serialize the default errors map.
	body.append(" defe = {\n")
	for key, value in sorted(parser.default_errors.items()):
	body += [f" {key}:", sym(value), ",\n"]
	body.append(" }\n")

	# Finally, Parser construction.
	body.append(
	" return lr1.Parser(\n" #
	" item_sets=None,\n"
	" goto=goto,\n"
	" action=act,\n"
	f" conflicts={repr(parser.conflicts)},\n"
	" terminals=None,\n"
	" nonterminals=None,\n"
	" productions=prods,\n"
	" default_errors=defe,\n"
	" )"
	)

	# Postprocess the output to swap the symbol placeholders for proper
	# symbols.
	#
	# Symbol assignment is done late (here) so that one-character symbols can
	# be used for the most frequently-reference objects.
	#
	# It would be possible to inline objects that are only used once, but there
	# do not seem to be very many of those, and it would be necessary to track
	# symbol definitions individually, instead of just concatenating them all
	# together in symbol_defs.
	symbol_idents = {} # Map of placeholder => final identifier
	ident_counter = 0 # Counter used for generating identifiers
	reserved_identifiers = "P S R A E prods act goto defe".split()

	# Iterate through the symbols from most common to least common.
	for _, _, symbol in sorted(
	((v, str(type(k)), k) for k, v in symbol_uses.items()), reverse=True
	):
	# Find the next available identifier, skipping identifiers that are
	# used by the skeleton and identifiers that are actually Python
	# keywords.
	while True:
	ident = _identifier(ident_counter)
	ident_counter += 1
	if ident not in reserved_identifiers and not keyword.iskeyword(ident):
	break
	# Assign the final symbol to the placeholder.
	symbol_idents[symbols[symbol]] = ident

	# Iterate through the text fragments looking for any placeholders (S's),
	# and replace the placeholders with their final symbol.
	for l in (symbol_defs, body):
	for i in range(len(l)):
	if isinstance(l[i], S):
	l[i] = symbol_idents[l[i]]

	# Return the final result.
	return "".join(header + symbol_defs + body)


	_HEADER = """
	# Copyright 2024 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# GENERATED CODE. DO NOT MANUALLY EDIT. TO UPDATE, RUN:
	#
	# bazel run //compiler/front_end:generate_cached_parser > compiler/front_end/generated/cached_parser.py

	from compiler.front_end import lr1
	from compiler.util import parser_types
	""".strip()


	def generate_parser_file_text():
	module_parser = make_parser.build_module_parser()
	expression_parser = make_parser.build_expression_parser()
	return "".join(
	[
	_HEADER,
	"\n",
	as_py_source(module_parser, "module_parser"),
	"\n",
	as_py_source(expression_parser, "expression_parser"),
	"\n",
	]
	)


	def main(argv):
	print(generate_parser_file_text(), end="")
	return 0


	if __name__ == "__main__":
	sys.exit(main(sys.argv))