| # Copyright 2019 Google LLC |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Main driver for the Emboss front-end. |
| |
| The parse_emboss_file function performs a complete parse of the specified file, |
| and returns an IR or formatted error message. |
| """ |
| |
| import collections |
| import pkgutil |
| |
| from compiler.front_end import attribute_checker |
| from compiler.front_end import constraints |
| from compiler.front_end import dependency_checker |
| from compiler.front_end import expression_bounds |
| from compiler.front_end import lr1 |
| from compiler.front_end import module_ir |
| from compiler.front_end import parser |
| from compiler.front_end import symbol_resolver |
| from compiler.front_end import synthetics |
| from compiler.front_end import tokenizer |
| from compiler.front_end import type_check |
| from compiler.front_end import write_inference |
| from compiler.util import ir_pb2 |
| from compiler.util import error |
| from compiler.util import parser_types |
| |
# Result triple returned by the parsing entry points in this module: the IR
# (None when there are errors), the accompanying debug info, and the list of
# errors (empty on success).
_IrDebugInfo = collections.namedtuple(
    "IrDebugInfo", ("ir", "debug_info", "errors"))
| |
| |
class DebugInfo(object):
  """Debug information about Emboss parsing.

  Attributes:
    modules: A dict of per-module debug information.  It starts out empty and
        is filled in by the code that drives parsing.
  """
  # Must be a one-element tuple: `("modules")` is just the string "modules",
  # which only worked because Python accepts a single string as a single slot
  # name.
  __slots__ = ("modules",)

  def __init__(self):
    self.modules = {}

  def __eq__(self, other):
    # Let Python fall back to its default handling for foreign types instead of
    # raising AttributeError on the missing `modules` attribute.
    if not isinstance(other, DebugInfo):
      return NotImplemented
    return self.modules == other.modules

  def __ne__(self, other):
    # Spelled out explicitly for Python 2, which does not derive != from ==.
    return not self == other
| |
| |
class ModuleDebugInfo(object):
  """Debug information about the parse of a single file.

  Attributes:
    file_name: The name of the file from which this module came.
    tokens: The tokenization of this module's source text.
    parse_tree: The raw parse tree for this module.
    ir: The intermediate representation of this module, before additional
        processing such as symbol resolution.
    used_productions: The set of grammar productions used when parsing this
        module.
    source_code: The source text of the module.
  """
  __slots__ = ("file_name", "tokens", "parse_tree", "ir", "used_productions",
               "source_code")

  def __init__(self, file_name):
    self.file_name = file_name
    # Everything else starts out unset and is filled in as parsing progresses.
    self.tokens = None
    self.parse_tree = None
    self.ir = None
    self.used_productions = None
    self.source_code = None

  def __eq__(self, other):
    # Let Python fall back to its default handling for foreign types instead of
    # raising AttributeError on a missing attribute.
    if not isinstance(other, ModuleDebugInfo):
      return NotImplemented
    # Field-by-field comparison, in slot order, stopping at the first mismatch.
    return all(getattr(self, name) == getattr(other, name)
               for name in ModuleDebugInfo.__slots__)

  def __ne__(self, other):
    # Spelled out explicitly for Python 2, which does not derive != from ==.
    return not self == other

  def format_tokenization(self):
    """Renders self.tokens in a human-readable format, one token per line."""
    return "\n".join(str(token) for token in self.tokens)

  def format_parse_tree(self, parse_tree=None, indent=""):
    """Renders self.parse_tree in a human-readable format.

    Arguments:
      parse_tree: The (sub)tree to render.  Defaults to self.parse_tree; the
          recursive calls pass child nodes here.
      indent: The prefix prepended to each line rendered at this level.

    Returns:
      A string with one node per line, each line ending in a newline.
    """
    if parse_tree is None:
      parse_tree = self.parse_tree
    result = []
    if isinstance(parse_tree, lr1.Reduction):
      result.append(indent + parse_tree.symbol)
      if parse_tree.children:
        # A reduction with children gets a trailing ":" and its children are
        # rendered one level deeper.
        result.append(":\n")
        for child in parse_tree.children:
          result.append(self.format_parse_tree(child, indent + " "))
      else:
        result.append("\n")
    else:
      # Anything that is not a Reduction is rendered via its own formatting.
      result.append("{}{}\n".format(indent, parse_tree))
    return "".join(result)

  def format_module_ir(self):
    """Renders self.ir in a human-readable format (JSON, 2-space indent)."""
    return self.ir.to_json(indent=2)
| |
| |
def format_production_set(productions):
  """Renders a set of productions in a human-readable format.

  Arguments:
    productions: An iterable of mutually-orderable productions.

  Returns:
    The str() of each production, in sorted order, one per line.
  """
  ordered = sorted(productions)
  return "\n".join(map(str, ordered))
| |
| |
# ModuleDebugInfo objects for modules that tokenized and parsed successfully,
# keyed by (source_code, file_name).
_cached_modules = {}


def parse_module_text(source_code, file_name):
  """Parses the text of a module, returning a module-level IR.

  Arguments:
    source_code: The text of the module to parse.
    file_name: The name of the module's source file (will be included in the
        resulting IR).

  Returns:
    (ir, debug_info, errors), where ir is a module-level intermediate
    representation (IR), prior to import and symbol resolution, debug_info is
    the corresponding ModuleDebugInfo, for debugging the parser, and errors is
    a list of tokenization or parse errors.  If errors is not an empty list, ir
    will be None.
  """
  cache_key = (source_code, file_name)
  # This is strictly an optimization to speed up tests, mostly by avoiding the
  # need to re-parse the prelude for every test .emb.
  cached_debug_info = _cached_modules.get(cache_key)
  if cached_debug_info is not None:
    # Hand out a fresh copy so the cached IR is not shared with the caller.
    module = ir_pb2.Module()
    module.CopyFrom(cached_debug_info.ir)
    module.source_file_name = file_name
    return _IrDebugInfo(module, cached_debug_info, [])

  debug_info = ModuleDebugInfo(file_name)
  debug_info.source_code = source_code

  tokens, tokenizer_errors = tokenizer.tokenize(source_code, file_name)
  if tokenizer_errors:
    return _IrDebugInfo(None, debug_info, tokenizer_errors)
  debug_info.tokens = tokens

  parse_result = parser.parse_module(tokens)
  if parse_result.error:
    parse_error = error.make_error_from_parse_error(file_name,
                                                    parse_result.error)
    return _IrDebugInfo(None, debug_info, [parse_error])
  debug_info.parse_tree = parse_result.parse_tree

  used_productions = set()
  module = module_ir.build_ir(parse_result.parse_tree, used_productions)
  debug_info.used_productions = used_productions
  # The debug info keeps its own copy of the IR, taken before the source file
  # name is filled in below, so it does not alias the IR that is returned.
  debug_info.ir = ir_pb2.Module()
  debug_info.ir.CopyFrom(module)
  _cached_modules[cache_key] = debug_info

  module.source_file_name = file_name
  return _IrDebugInfo(module, debug_info, [])
| |
| |
def parse_module(file_name, file_reader):
  """Reads and parses a module, returning a module-level IR.

  Arguments:
    file_name: The name of the module's source file.
    file_reader: A callable that takes a file name and returns either:
        (file_contents, None) or
        (None, list_of_error_detail_strings)

  Returns:
    (ir, debug_info, errors), where ir is a module-level intermediate
    representation (IR), debug_info is a ModuleDebugInfo containing the
    tokenization, parse tree, and original source text of the module, and
    errors is a list of read, tokenization, or parse errors.  If errors is not
    an empty list, ir will be None.  If the file could not be read at all,
    debug_info will also be None.
  """
  source_code, read_errors = file_reader(file_name)
  if read_errors:
    # There is no source text to point into, so every message is pinned to the
    # very start of the file.
    location = parser_types.make_location((1, 1), (1, 1))
    # A single error group: the headline error followed by one note per detail
    # string from the reader.
    read_failure = [error.error(file_name, location, "Unable to read file.")]
    read_failure.extend(
        error.note(file_name, location, detail) for detail in read_errors)
    # Returned as an _IrDebugInfo, like every other result in this module; it
    # still unpacks as a plain 3-tuple.
    return _IrDebugInfo(None, None, [read_failure])
  return parse_module_text(source_code, file_name)
| |
| |
def get_prelude():
  """Returns the module IR and debug info of the Emboss Prelude.

  The prelude source is loaded from the `prelude.emb` resource of the
  `compiler.front_end` package and parsed under the empty file name.
  """
  prelude_bytes = pkgutil.get_data("compiler.front_end", "prelude.emb")
  prelude_text = prelude_bytes.decode(encoding="UTF-8")
  return parse_module_text(prelude_text, "")
| |
| |
def parse_emboss_file(file_name, file_reader, stop_before_step=None):
  """Fully parses an .emb, and returns an IR suitable for passing to a back end.

  parse_emboss_file is a convenience function which calls only_parse_emboss_file
  and process_ir.

  Arguments:
    file_name: The name of the module's source file.
    file_reader: A callable that takes a file name and returns either
        (file_contents, None) or (None, list_of_error_detail_strings).
    stop_before_step: If set, parse_emboss_file will stop normalizing the IR
        just before the specified step.  This parameter should be None for
        non-test code.

  Returns:
    (ir, debug_info, errors), where ir is a complete IR, ready for consumption
    by an Emboss back end, debug_info is a DebugInfo containing the
    tokenization, parse tree, and original source text of all modules, and
    errors is a list of errors found while parsing or processing.  If errors is
    not an empty list, ir will be None.
  """
  first_stage_ir, debug_info, parse_errors = only_parse_emboss_file(
      file_name, file_reader)
  if parse_errors:
    return _IrDebugInfo(None, debug_info, parse_errors)

  final_ir, processing_errors = process_ir(first_stage_ir, stop_before_step)
  if processing_errors:
    # Never hand back an IR alongside errors.
    final_ir = None
  return _IrDebugInfo(final_ir, debug_info, processing_errors)
| |
| |
def only_parse_emboss_file(file_name, file_reader):
  """Parses an .emb, and returns an IR suitable for process_ir.

  only_parse_emboss_file parses the given file and all of its transitive
  imports, and returns a first-stage intermediate representation, which can be
  passed to process_ir.

  Arguments:
    file_name: The name of the module's source file.
    file_reader: A callable that takes a file name and returns either
        (file_contents, None) or (None, list_of_error_detail_strings).

  Returns:
    (ir, debug_info, errors), where ir is an intermediate representation (IR),
    debug_info is a DebugInfo containing the tokenization, parse tree, and
    original source text of all modules, and errors is a list of tokenization or
    parse errors.  If errors is not an empty list, ir will be None.
  """
  # Breadth-first walk over the import graph.  A deque gives O(1) removal from
  # the front, unlike deleting index 0 of a list.
  file_queue = collections.deque([file_name])
  # Every file name that has ever been queued, so each is parsed only once.
  seen_files = {file_name}
  debug_info = DebugInfo()
  ir = ir_pb2.EmbossIr(module=[])
  while file_queue:
    file_to_parse = file_queue.popleft()
    if file_to_parse:
      module, module_debug_info, errors = parse_module(file_to_parse,
                                                       file_reader)
    else:
      # An empty file name selects the built-in prelude.
      module, module_debug_info, errors = get_prelude()
    # Record whatever debug info exists, even for a module that failed, before
    # bailing out on errors.
    if module_debug_info:
      debug_info.modules[file_to_parse] = module_debug_info
    if errors:
      return _IrDebugInfo(None, debug_info, errors)
    ir.module.extend([module])  # Proto supports extend but not append here.
    for import_ in module.foreign_import:
      imported_file_name = import_.file_name.text
      if imported_file_name not in seen_files:
        file_queue.append(imported_file_name)
        seen_files.add(imported_file_name)
  return _IrDebugInfo(ir, debug_info, [])
| |
| |
def process_ir(ir, stop_before_step):
  """Turns a first-stage IR into a fully-processed IR.

  process_ir performs all of the semantic processing steps on `ir`: resolving
  symbols, checking dependencies, adding type annotations, normalizing
  attributes, etc.  process_ir is generally meant to be called with the result
  of only_parse_emboss_file(), but in theory could be called with a first-stage
  intermediate representation (IR) from another source.

  Arguments:
    ir: The IR to process.  This structure will be modified during processing.
    stop_before_step: If set, this must be the function name of one of the
        processing passes; process_ir will stop normalizing the IR just before
        that pass.  This parameter should be None for non-test code.

  Returns:
    (ir, errors), where ir is a complete IR, ready for consumption by an Emboss
    back end, and errors is a list of compilation errors.  If errors is not an
    empty list, ir will be None.
  """
  # The passes run in exactly this order.
  passes = (
      synthetics.synthesize_fields,
      symbol_resolver.resolve_symbols,
      dependency_checker.find_dependency_cycles,
      dependency_checker.set_dependency_order,
      symbol_resolver.resolve_field_references,
      type_check.annotate_types,
      type_check.check_types,
      expression_bounds.compute_constants,
      attribute_checker.normalize_and_verify,
      constraints.check_constraints,
      write_inference.set_write_methods,
  )
  pass_names = [pass_function.__name__ for pass_function in passes]
  assert stop_before_step in [None] + pass_names, (
      "Bad value for stop_before_step.")
  # Some parts of the IR are synthesized from "natural" parts of the IR, before
  # the natural parts have been fully error checked.  Because of this, the
  # synthesized parts can have errors; in a couple of cases, they can have
  # errors that show up in an earlier pass than the errors in the natural parts
  # of the IR.  As an example:
  #
  #     struct Foo:
  #       0 [+1] bits:
  #         0 [+1] Flag flag
  #       1 [+flag] UInt:8 field
  #
  # In this case, the use of `flag` as the size of `field` is incorrect, because
  # `flag` is a boolean, but the size of a field must be an integer.
  #
  # Type checking occurs in two passes: in the first pass, expressions are
  # checked for internal consistency.  In the second pass, expression types are
  # checked against their location.  The use of `flag` would be caught in the
  # second pass.
  #
  # However, the field-synthesis pass will synthesize a $size_in_bytes virtual
  # field that would look like:
  #
  #     struct Foo:
  #       0 [+1] bits:
  #         0 [+1] Flag flag
  #       1 [+flag] UInt:8 field
  #       let $size_in_bytes = $max(true ? 0 + 1 : 0, true ? 1 + flag : 0)
  #
  # Since `1 + flag` is not internally consistent, this type error would be
  # caught in the first pass, and the user would see a very strange error
  # message that "the right-hand argument of operator `+` must be an integer."
  #
  # In order to avoid showing these kinds of errors to the user, we defer any
  # errors in synthetic parts of the IR.  Unless there is a compiler bug, those
  # errors will show up as errors in the natural parts of the IR, which should
  # be much more comprehensible to end users.
  #
  # If, for some reason, there is an error in the synthetic IR, but no error in
  # the natural IR, the synthetic errors will be shown.  In this case, the
  # formatting for the synthetic errors will show '[compiler bug]' for the
  # error location, which (hopefully) will provide the end user with a cue that
  # the error is a compiler bug.
  deferred_errors = []
  for pass_function in passes:
    if pass_function.__name__ == stop_before_step:
      # Test-only early exit: return the partially-processed IR as-is.
      return (ir, [])
    visible_errors, synthetic_errors = error.split_errors(pass_function(ir))
    if visible_errors:
      # Errors in the natural IR end processing immediately.
      return (None, visible_errors)
    deferred_errors.extend(synthetic_errors)

  # Every pass ran without a visible error; only now do the deferred synthetic
  # errors get reported.
  if deferred_errors:
    return (None, deferred_errors)

  assert stop_before_step is None, "Bad value for stop_before_step."
  return (ir, [])