| // Copyright 2022 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "./grammar_codegen/antlr_frontend.h" |
| |
| #include <fstream> |
| #include <optional> |
| #include <streambuf> |
| #include <string> |
| #include <variant> |
| #include <vector> |
| |
| #include "absl/strings/ascii.h" |
| #include "absl/strings/str_format.h" |
| #include "./grammar_codegen/generated_antlr_parser/ANTLRv4Lexer.h" |
| #include "./grammar_codegen/grammar_info.h" |
| #include "./fuzztest/internal/logging.h" |
| |
| namespace fuzztest::internal::grammar { |
| |
| namespace { |
| |
| // The string literal in antlr4 is quote by '\'', we change it to '"'. |
| void ChangeStringQuote(std::string& str) { |
| if (!str.empty() && str.front() == '\'' && str.back() == '\'') { |
| str.front() = str.back() = '"'; |
| } |
| } |
| |
| std::string EscapeString(std::string_view text) { |
| std::string excape_text; |
| for (int i = 0; i < text.size(); ++i) { |
| if (text[i] == '"' || text[i] == '\\') { |
| excape_text.push_back('\\'); |
| } |
| excape_text.push_back(text[i]); |
| } |
| ChangeStringQuote(excape_text); |
| return excape_text; |
| } |
| |
| // TODO(changochen): Handle Unicode. |
| // Add '^' to the char set if it is a NOT set. |
| std::string ConstructCharSetString(std::string_view raw_str, |
| bool is_not_set = false) { |
| FUZZTEST_INTERNAL_CHECK( |
| raw_str.size() > 2 && raw_str.front() == '[' && raw_str.back() == ']', |
| "Passed argument is not a range string: `" + std::string(raw_str) + "`"); |
| std::string result = EscapeString(raw_str); |
| if (is_not_set) { |
| result = absl::StrFormat("[^%s]", result.substr(1, result.size() - 2)); |
| } |
| return result; |
| } |
| } // namespace |
| |
| void GrammarInfoBuilder::enterLexerRuleSpec( |
| ANTLRv4Parser::LexerRuleSpecContext* ctx) { |
| rules_.push_back(ConstructGrammarRule(ctx)); |
| } |
| |
| void GrammarInfoBuilder::enterGrammarSpec( |
| ANTLRv4Parser::GrammarSpecContext* ctx) { |
| // A specification file can be for lexer, grammar or both. If it is not a |
| // lexer specification file, it includes grammar and we use its name for the |
| // namespace. |
| if (ctx->grammarDecl()->grammarType()->LEXER() == nullptr) { |
| grammar_name_ = |
| absl::AsciiStrToLower(ctx->grammarDecl()->identifier()->getText()); |
| } |
| } |
| |
| void GrammarInfoBuilder::enterParserRuleSpec( |
| ANTLRv4Parser::ParserRuleSpecContext* ctx) { |
| rules_.push_back(ConstructGrammarRule(ctx)); |
| } |
| |
| Block GrammarInfoBuilder::ConstructBlock( |
| ANTLRv4Parser::LexerAtomContext* lexer_atom_ctx) { |
| Block block; |
| if (lexer_atom_ctx->terminal() && lexer_atom_ctx->terminal()->TOKEN_REF()) { |
| auto k = block.element.emplace<kNonTerminal>( |
| NonTerminal{lexer_atom_ctx->getText()}); |
| FUZZTEST_INTERNAL_CHECK(!k.name.empty(), "Empty name!"); |
| } else { |
| auto& terminal = block.element.emplace<kTerminal>(); |
| std::string text = lexer_atom_ctx->getText(); |
| if (lexer_atom_ctx->terminal()) { |
| terminal.content = EscapeString(text); |
| terminal.type = TerminalType::kStringLiteral; |
| } else if (lexer_atom_ctx->characterRange() || |
| lexer_atom_ctx->LEXER_CHAR_SET()) { |
| terminal.type = TerminalType::kCharSet; |
| terminal.content = ConstructCharSetString(lexer_atom_ctx->getText()); |
| } else if (lexer_atom_ctx->notSet()) { |
| terminal.type = TerminalType::kCharSet; |
| if (auto set_element_p = lexer_atom_ctx->notSet()->setElement()) { |
| if (set_element_p->LEXER_CHAR_SET() || |
| set_element_p->STRING_LITERAL()) { |
| terminal.content = |
| ConstructCharSetString(set_element_p->getText(), true); |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Not lexer char set!"); |
| } |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!"); |
| } |
| } else if (lexer_atom_ctx->DOT()) { |
| terminal.type = TerminalType::kCharSet; |
| terminal.content = EscapeString(lexer_atom_ctx->DOT()->getText()); |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!"); |
| } |
| } |
| return block; |
| } |
| |
| ProductionRule GrammarInfoBuilder::ConstructProductionRule( |
| ANTLRv4Parser::LexerAltContext* ctx) { |
| ProductionRule prod_rule; |
| for (auto element : ctx->lexerElements()->lexerElement()) { |
| Block block; |
| if (element->lexerBlock() != NULL) { |
| auto& sub_productions = block.element.emplace<2>(); |
| for (auto sub_rule : element->lexerBlock()->lexerAltList()->lexerAlt()) { |
| sub_productions.production_rules.push_back( |
| ConstructProductionRule(sub_rule)); |
| } |
| } else if (element->lexerAtom() != NULL) { |
| block = ConstructBlock(element->lexerAtom()); |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!"); |
| } |
| if (element->ebnfSuffix()) { |
| block.range = ParseRange(element->ebnfSuffix()->getText()); |
| } |
| prod_rule.blocks.push_back(std::move(block)); |
| } |
| return prod_rule; |
| } |
| |
| GrammarRule GrammarInfoBuilder::ConstructGrammarRule( |
| ANTLRv4Parser::LexerRuleSpecContext* ctx) { |
| GrammarRule grammar_rule; |
| grammar_rule.symbol_name = ctx->TOKEN_REF()->getText(); |
| |
| for (auto lexer_alt : ctx->lexerRuleBlock()->lexerAltList()->lexerAlt()) { |
| grammar_rule.productions.production_rules.push_back( |
| ConstructProductionRule(lexer_alt)); |
| } |
| return grammar_rule; |
| } |
| |
| Range GrammarInfoBuilder::ParseRange(std::string_view s) { |
| return (s == "?") ? Range::kOptional |
| : (s == "+") ? Range::kNonEmpty |
| : (s == "*") ? Range::kUnlimited |
| : (FUZZTEST_INTERNAL_CHECK( |
| false, absl::StrCat("Unhandled case: ", s)), |
| Range::kNoRange); |
| } |
| |
| Block GrammarInfoBuilder::ConstructBlock( |
| ANTLRv4Parser::BlockContext* block_ctx) { |
| Block constructed_block; |
| |
| auto& sub_productions = constructed_block.element.emplace<kSubProductions>(); |
| for (auto sub_prod : block_ctx->altList()->alternative()) { |
| sub_productions.production_rules.push_back( |
| ConstructProductionRule(sub_prod)); |
| } |
| return constructed_block; |
| } |
| |
| Block GrammarInfoBuilder::ConstructBlock(ANTLRv4Parser::AtomContext* atom_ctx) { |
| Block constructed_block; |
| std::string node_name = atom_ctx->getText(); |
| if (atom_ctx->ruleref() || |
| (atom_ctx->terminal() && atom_ctx->terminal()->TOKEN_REF())) { |
| constructed_block.element.emplace<kNonTerminal>(NonTerminal{node_name}); |
| } else if (atom_ctx->terminal()) { |
| auto& terminal_node = constructed_block.element.emplace<kTerminal>(); |
| terminal_node.type = TerminalType::kStringLiteral; |
| ChangeStringQuote(node_name); |
| terminal_node.content = node_name; |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!"); |
| } |
| return constructed_block; |
| } |
| |
| Block GrammarInfoBuilder::ConstructBlock( |
| ANTLRv4Parser::ElementContext* element) { |
| Block constructed_block; |
| std::optional<std::string> suffix; |
| if (auto atom = element->atom()) { |
| constructed_block = ConstructBlock(atom); |
| } else if (auto ebnf = element->ebnf()) { |
| constructed_block = ConstructBlock(ebnf->block()); |
| if (ebnf->blockSuffix()) { |
| suffix = ebnf->blockSuffix()->getText(); |
| } |
| } else if (auto labeled_element = element->labeledElement()) { |
| if (labeled_element->atom()) { |
| constructed_block = ConstructBlock(labeled_element->atom()); |
| } else if (labeled_element->block()) { |
| constructed_block = ConstructBlock(labeled_element->block()); |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Impossible case!"); |
| } |
| } else { |
| FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!"); |
| } |
| |
| if (element->ebnfSuffix()) { |
| FUZZTEST_INTERNAL_CHECK(!suffix.has_value(), |
| "It should have only one suffix."); |
| suffix = element->ebnfSuffix()->getText(); |
| } |
| |
| if (suffix.has_value()) constructed_block.range = ParseRange(*suffix); |
| return constructed_block; |
| } |
| |
| ProductionRule GrammarInfoBuilder::ConstructProductionRule( |
| ANTLRv4Parser::AlternativeContext* ctx) { |
| ProductionRule prod_rule; |
| for (auto element : ctx->element()) { |
| prod_rule.blocks.push_back(ConstructBlock(element)); |
| } |
| return prod_rule; |
| } |
| |
| GrammarRule GrammarInfoBuilder::ConstructGrammarRule( |
| ANTLRv4Parser::ParserRuleSpecContext* pctx) { |
| GrammarRule grammar_rule; |
| grammar_rule.symbol_name = pctx->RULE_REF()->getText(); |
| auto labeledAlt_vec = pctx->ruleBlock()->ruleAltList()->labeledAlt(); |
| for (auto labeled_alt : pctx->ruleBlock()->ruleAltList()->labeledAlt()) { |
| auto ctx = labeled_alt->alternative(); |
| FUZZTEST_INTERNAL_CHECK(ctx != nullptr, "Unhandeled case!"); |
| grammar_rule.productions.production_rules.push_back( |
| ConstructProductionRule(ctx)); |
| } |
| return grammar_rule; |
| } |
| |
| Grammar GrammarInfoBuilder::BuildGrammarInfo( |
| const std::vector<std::string>& input_grammar_specs) { |
| FUZZTEST_INTERNAL_CHECK_PRECONDITION(!input_grammar_specs.empty(), |
| "No input files!"); |
| for (auto& input_grammar_spec : input_grammar_specs) { |
| antlr4::ANTLRInputStream input(input_grammar_spec); |
| antlr4_grammar::ANTLRv4Lexer lexer(&input); |
| antlr4::CommonTokenStream tokens(&lexer); |
| |
| ANTLRv4Parser parser(&tokens); |
| try { |
| antlr4::tree::ParseTree* tree = parser.grammarSpec(); |
| antlr4::tree::ParseTreeWalker::DEFAULT.walk(this, tree); |
| } catch (antlr4::ParseCancellationException) { |
| FUZZTEST_INTERNAL_CHECK(false, "Cannot parse the grammar files!"); |
| } catch (...) { |
| // The ParseCancellationException might miss some errors. So we need to |
| // catch everything here. |
| FUZZTEST_INTERNAL_CHECK(false, "Unknown errors!"); |
| } |
| } |
| FUZZTEST_INTERNAL_CHECK(!grammar_name_.empty() && !rules_.empty(), |
| "Wrong grammar file!"); |
| return Grammar{std::move(grammar_name_), std::move(rules_)}; |
| } |
| |
| } // namespace fuzztest::internal::grammar |