grammar_codegen/antlr_frontend.cc - third_party/github/google/fuzztest - Git at Google

 // Copyright 2022 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "./grammar_codegen/antlr_frontend.h"

 #include <fstream>
 #include <optional>
 #include <streambuf>
 #include <string>
 #include <variant>
 #include <vector>

 #include "absl/strings/ascii.h"
 #include "absl/strings/str_format.h"
 #include "./grammar_codegen/generated_antlr_parser/ANTLRv4Lexer.h"
 #include "./grammar_codegen/grammar_info.h"
 #include "./fuzztest/internal/logging.h"

 namespace fuzztest::internal::grammar {

 namespace {

 // The string literal in antlr4 is quote by '\'', we change it to '"'.
 void ChangeStringQuote(std::string& str) {
   if (!str.empty() && str.front() == '\'' && str.back() == '\'') {
     str.front() = str.back() = '"';
   }
 }

 std::string EscapeString(std::string_view text) {
   std::string excape_text;
   for (int i = 0; i < text.size(); ++i) {
     if (text[i] == '"' || text[i] == '\\') {
       excape_text.push_back('\\');
     }
     excape_text.push_back(text[i]);
   }
   ChangeStringQuote(excape_text);
   return excape_text;
 }

 // TODO(changochen): Handle Unicode.
 // Add '^' to the char set if it is a NOT set.
 std::string ConstructCharSetString(std::string_view raw_str,
                                    bool is_not_set = false) {
   FUZZTEST_INTERNAL_CHECK(
       raw_str.size() > 2 && raw_str.front() == '[' && raw_str.back() == ']',
       "Passed argument is not a range string: `" + std::string(raw_str) + "`");
   std::string result = EscapeString(raw_str);
   if (is_not_set) {
     result = absl::StrFormat("[^%s]", result.substr(1, result.size() - 2));
   }
   return result;
 }
 }  // namespace

 void GrammarInfoBuilder::enterLexerRuleSpec(
     ANTLRv4Parser::LexerRuleSpecContext* ctx) {
   rules_.push_back(ConstructGrammarRule(ctx));
 }

 void GrammarInfoBuilder::enterGrammarSpec(
     ANTLRv4Parser::GrammarSpecContext* ctx) {
   // A specification file can be for lexer, grammar or both. If it is not a
   // lexer specification file, it includes grammar and we use its name for the
   // namespace.
   if (ctx->grammarDecl()->grammarType()->LEXER() == nullptr) {
     grammar_name_ =
         absl::AsciiStrToLower(ctx->grammarDecl()->identifier()->getText());
   }
 }

 void GrammarInfoBuilder::enterParserRuleSpec(
     ANTLRv4Parser::ParserRuleSpecContext* ctx) {
   rules_.push_back(ConstructGrammarRule(ctx));
 }

 Block GrammarInfoBuilder::ConstructBlock(
     ANTLRv4Parser::LexerAtomContext* lexer_atom_ctx) {
   Block block;
   if (lexer_atom_ctx->terminal() && lexer_atom_ctx->terminal()->TOKEN_REF()) {
     auto k = block.element.emplace<kNonTerminal>(
         NonTerminal{lexer_atom_ctx->getText()});
     FUZZTEST_INTERNAL_CHECK(!k.name.empty(), "Empty name!");
   } else {
     auto& terminal = block.element.emplace<kTerminal>();
     std::string text = lexer_atom_ctx->getText();
     if (lexer_atom_ctx->terminal()) {
       terminal.content = EscapeString(text);
       terminal.type = TerminalType::kStringLiteral;
     } else if (lexer_atom_ctx->characterRange() ||
                lexer_atom_ctx->LEXER_CHAR_SET()) {
       terminal.type = TerminalType::kCharSet;
       terminal.content = ConstructCharSetString(lexer_atom_ctx->getText());
     } else if (lexer_atom_ctx->notSet()) {
       terminal.type = TerminalType::kCharSet;
       if (auto set_element_p = lexer_atom_ctx->notSet()->setElement()) {
         if (set_element_p->LEXER_CHAR_SET() ||
             set_element_p->STRING_LITERAL()) {
           terminal.content =
               ConstructCharSetString(set_element_p->getText(), true);
         } else {
           FUZZTEST_INTERNAL_CHECK(false, "Not lexer char set!");
         }
       } else {
         FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
       }
     } else if (lexer_atom_ctx->DOT()) {
       terminal.type = TerminalType::kCharSet;
       terminal.content = EscapeString(lexer_atom_ctx->DOT()->getText());
     } else {
       FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
     }
   }
   return block;
 }

 ProductionRule GrammarInfoBuilder::ConstructProductionRule(
     ANTLRv4Parser::LexerAltContext* ctx) {
   ProductionRule prod_rule;
   for (auto element : ctx->lexerElements()->lexerElement()) {
     Block block;
     if (element->lexerBlock() != NULL) {
       auto& sub_productions = block.element.emplace<2>();
       for (auto sub_rule : element->lexerBlock()->lexerAltList()->lexerAlt()) {
         sub_productions.production_rules.push_back(
             ConstructProductionRule(sub_rule));
       }
     } else if (element->lexerAtom() != NULL) {
       block = ConstructBlock(element->lexerAtom());
     } else {
       FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
     }
     if (element->ebnfSuffix()) {
       block.range = ParseRange(element->ebnfSuffix()->getText());
     }
     prod_rule.blocks.push_back(std::move(block));
   }
   return prod_rule;
 }

 GrammarRule GrammarInfoBuilder::ConstructGrammarRule(
     ANTLRv4Parser::LexerRuleSpecContext* ctx) {
   GrammarRule grammar_rule;
   grammar_rule.symbol_name = ctx->TOKEN_REF()->getText();

   for (auto lexer_alt : ctx->lexerRuleBlock()->lexerAltList()->lexerAlt()) {
     grammar_rule.productions.production_rules.push_back(
         ConstructProductionRule(lexer_alt));
   }
   return grammar_rule;
 }

 Range GrammarInfoBuilder::ParseRange(std::string_view s) {
   return (s == "?")   ? Range::kOptional
          : (s == "+") ? Range::kNonEmpty
          : (s == "*") ? Range::kUnlimited
                       : (FUZZTEST_INTERNAL_CHECK(
                              false, absl::StrCat("Unhandled case: ", s)),
                          Range::kNoRange);
 }

 Block GrammarInfoBuilder::ConstructBlock(
     ANTLRv4Parser::BlockContext* block_ctx) {
   Block constructed_block;

   auto& sub_productions = constructed_block.element.emplace<kSubProductions>();
   for (auto sub_prod : block_ctx->altList()->alternative()) {
     sub_productions.production_rules.push_back(
         ConstructProductionRule(sub_prod));
   }
   return constructed_block;
 }

 Block GrammarInfoBuilder::ConstructBlock(ANTLRv4Parser::AtomContext* atom_ctx) {
   Block constructed_block;
   std::string node_name = atom_ctx->getText();
   if (atom_ctx->ruleref() ||
       (atom_ctx->terminal() && atom_ctx->terminal()->TOKEN_REF())) {
     constructed_block.element.emplace<kNonTerminal>(NonTerminal{node_name});
   } else if (atom_ctx->terminal()) {
     auto& terminal_node = constructed_block.element.emplace<kTerminal>();
     terminal_node.type = TerminalType::kStringLiteral;
     ChangeStringQuote(node_name);
     terminal_node.content = node_name;
   } else {
     FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
   }
   return constructed_block;
 }

 Block GrammarInfoBuilder::ConstructBlock(
     ANTLRv4Parser::ElementContext* element) {
   Block constructed_block;
   std::optional<std::string> suffix;
   if (auto atom = element->atom()) {
     constructed_block = ConstructBlock(atom);
   } else if (auto ebnf = element->ebnf()) {
     constructed_block = ConstructBlock(ebnf->block());
     if (ebnf->blockSuffix()) {
       suffix = ebnf->blockSuffix()->getText();
     }
   } else if (auto labeled_element = element->labeledElement()) {
     if (labeled_element->atom()) {
       constructed_block = ConstructBlock(labeled_element->atom());
     } else if (labeled_element->block()) {
       constructed_block = ConstructBlock(labeled_element->block());
     } else {
       FUZZTEST_INTERNAL_CHECK(false, "Impossible case!");
     }
   } else {
     FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
   }

   if (element->ebnfSuffix()) {
     FUZZTEST_INTERNAL_CHECK(!suffix.has_value(),
                             "It should have only one suffix.");
     suffix = element->ebnfSuffix()->getText();
   }

   if (suffix.has_value()) constructed_block.range = ParseRange(*suffix);
   return constructed_block;
 }

 ProductionRule GrammarInfoBuilder::ConstructProductionRule(
     ANTLRv4Parser::AlternativeContext* ctx) {
   ProductionRule prod_rule;
   for (auto element : ctx->element()) {
     prod_rule.blocks.push_back(ConstructBlock(element));
   }
   return prod_rule;
 }

 GrammarRule GrammarInfoBuilder::ConstructGrammarRule(
     ANTLRv4Parser::ParserRuleSpecContext* pctx) {
   GrammarRule grammar_rule;
   grammar_rule.symbol_name = pctx->RULE_REF()->getText();
   auto labeledAlt_vec = pctx->ruleBlock()->ruleAltList()->labeledAlt();
   for (auto labeled_alt : pctx->ruleBlock()->ruleAltList()->labeledAlt()) {
     auto ctx = labeled_alt->alternative();
     FUZZTEST_INTERNAL_CHECK(ctx != nullptr, "Unhandeled case!");
     grammar_rule.productions.production_rules.push_back(
         ConstructProductionRule(ctx));
   }
   return grammar_rule;
 }

 Grammar GrammarInfoBuilder::BuildGrammarInfo(
     const std::vector<std::string>& input_grammar_specs) {
   FUZZTEST_INTERNAL_CHECK_PRECONDITION(!input_grammar_specs.empty(),
                                        "No input files!");
   for (auto& input_grammar_spec : input_grammar_specs) {
     antlr4::ANTLRInputStream input(input_grammar_spec);
     antlr4_grammar::ANTLRv4Lexer lexer(&input);
     antlr4::CommonTokenStream tokens(&lexer);

     ANTLRv4Parser parser(&tokens);
     try {
       antlr4::tree::ParseTree* tree = parser.grammarSpec();
       antlr4::tree::ParseTreeWalker::DEFAULT.walk(this, tree);
     } catch (antlr4::ParseCancellationException) {
       FUZZTEST_INTERNAL_CHECK(false, "Cannot parse the grammar files!");
     } catch (...) {
       // The ParseCancellationException might miss some errors. So we need to
       // catch everything here.
       FUZZTEST_INTERNAL_CHECK(false, "Unknown errors!");
     }
   }
   FUZZTEST_INTERNAL_CHECK(!grammar_name_.empty() && !rules_.empty(),
                           "Wrong grammar file!");
   return Grammar{std::move(grammar_name_), std::move(rules_)};
 }

 }  // namespace fuzztest::internal::grammar
	// Copyright 2022 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "./grammar_codegen/antlr_frontend.h"

	#include <fstream>
	#include <optional>
	#include <streambuf>
	#include <string>
	#include <variant>
	#include <vector>

	#include "absl/strings/ascii.h"
	#include "absl/strings/str_format.h"
	#include "./grammar_codegen/generated_antlr_parser/ANTLRv4Lexer.h"
	#include "./grammar_codegen/grammar_info.h"
	#include "./fuzztest/internal/logging.h"

	namespace fuzztest::internal::grammar {

	namespace {

	// The string literal in antlr4 is quote by '\'', we change it to '"'.
	void ChangeStringQuote(std::string& str) {
	if (!str.empty() && str.front() == '\'' && str.back() == '\'') {
	str.front() = str.back() = '"';
	}
	}

	std::string EscapeString(std::string_view text) {
	std::string excape_text;
	for (int i = 0; i < text.size(); ++i) {
	if (text[i] == '"' \|\| text[i] == '\\') {
	excape_text.push_back('\\');
	}
	excape_text.push_back(text[i]);
	}
	ChangeStringQuote(excape_text);
	return excape_text;
	}

	// TODO(changochen): Handle Unicode.
	// Add '^' to the char set if it is a NOT set.
	std::string ConstructCharSetString(std::string_view raw_str,
	bool is_not_set = false) {
	FUZZTEST_INTERNAL_CHECK(
	raw_str.size() > 2 && raw_str.front() == '[' && raw_str.back() == ']',
	"Passed argument is not a range string: `" + std::string(raw_str) + "`");
	std::string result = EscapeString(raw_str);
	if (is_not_set) {
	result = absl::StrFormat("[^%s]", result.substr(1, result.size() - 2));
	}
	return result;
	}
	} // namespace

	void GrammarInfoBuilder::enterLexerRuleSpec(
	ANTLRv4Parser::LexerRuleSpecContext* ctx) {
	rules_.push_back(ConstructGrammarRule(ctx));
	}

	void GrammarInfoBuilder::enterGrammarSpec(
	ANTLRv4Parser::GrammarSpecContext* ctx) {
	// A specification file can be for lexer, grammar or both. If it is not a
	// lexer specification file, it includes grammar and we use its name for the
	// namespace.
	if (ctx->grammarDecl()->grammarType()->LEXER() == nullptr) {
	grammar_name_ =
	absl::AsciiStrToLower(ctx->grammarDecl()->identifier()->getText());
	}
	}

	void GrammarInfoBuilder::enterParserRuleSpec(
	ANTLRv4Parser::ParserRuleSpecContext* ctx) {
	rules_.push_back(ConstructGrammarRule(ctx));
	}

	Block GrammarInfoBuilder::ConstructBlock(
	ANTLRv4Parser::LexerAtomContext* lexer_atom_ctx) {
	Block block;
	if (lexer_atom_ctx->terminal() && lexer_atom_ctx->terminal()->TOKEN_REF()) {
	auto k = block.element.emplace<kNonTerminal>(
	NonTerminal{lexer_atom_ctx->getText()});
	FUZZTEST_INTERNAL_CHECK(!k.name.empty(), "Empty name!");
	} else {
	auto& terminal = block.element.emplace<kTerminal>();
	std::string text = lexer_atom_ctx->getText();
	if (lexer_atom_ctx->terminal()) {
	terminal.content = EscapeString(text);
	terminal.type = TerminalType::kStringLiteral;
	} else if (lexer_atom_ctx->characterRange() \|\|
	lexer_atom_ctx->LEXER_CHAR_SET()) {
	terminal.type = TerminalType::kCharSet;
	terminal.content = ConstructCharSetString(lexer_atom_ctx->getText());
	} else if (lexer_atom_ctx->notSet()) {
	terminal.type = TerminalType::kCharSet;
	if (auto set_element_p = lexer_atom_ctx->notSet()->setElement()) {
	if (set_element_p->LEXER_CHAR_SET() \|\|
	set_element_p->STRING_LITERAL()) {
	terminal.content =
	ConstructCharSetString(set_element_p->getText(), true);
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Not lexer char set!");
	}
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
	}
	} else if (lexer_atom_ctx->DOT()) {
	terminal.type = TerminalType::kCharSet;
	terminal.content = EscapeString(lexer_atom_ctx->DOT()->getText());
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
	}
	}
	return block;
	}

	ProductionRule GrammarInfoBuilder::ConstructProductionRule(
	ANTLRv4Parser::LexerAltContext* ctx) {
	ProductionRule prod_rule;
	for (auto element : ctx->lexerElements()->lexerElement()) {
	Block block;
	if (element->lexerBlock() != NULL) {
	auto& sub_productions = block.element.emplace<2>();
	for (auto sub_rule : element->lexerBlock()->lexerAltList()->lexerAlt()) {
	sub_productions.production_rules.push_back(
	ConstructProductionRule(sub_rule));
	}
	} else if (element->lexerAtom() != NULL) {
	block = ConstructBlock(element->lexerAtom());
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
	}
	if (element->ebnfSuffix()) {
	block.range = ParseRange(element->ebnfSuffix()->getText());
	}
	prod_rule.blocks.push_back(std::move(block));
	}
	return prod_rule;
	}

	GrammarRule GrammarInfoBuilder::ConstructGrammarRule(
	ANTLRv4Parser::LexerRuleSpecContext* ctx) {
	GrammarRule grammar_rule;
	grammar_rule.symbol_name = ctx->TOKEN_REF()->getText();

	for (auto lexer_alt : ctx->lexerRuleBlock()->lexerAltList()->lexerAlt()) {
	grammar_rule.productions.production_rules.push_back(
	ConstructProductionRule(lexer_alt));
	}
	return grammar_rule;
	}

	Range GrammarInfoBuilder::ParseRange(std::string_view s) {
	return (s == "?") ? Range::kOptional
	: (s == "+") ? Range::kNonEmpty
	: (s == "*") ? Range::kUnlimited
	: (FUZZTEST_INTERNAL_CHECK(
	false, absl::StrCat("Unhandled case: ", s)),
	Range::kNoRange);
	}

	Block GrammarInfoBuilder::ConstructBlock(
	ANTLRv4Parser::BlockContext* block_ctx) {
	Block constructed_block;

	auto& sub_productions = constructed_block.element.emplace<kSubProductions>();
	for (auto sub_prod : block_ctx->altList()->alternative()) {
	sub_productions.production_rules.push_back(
	ConstructProductionRule(sub_prod));
	}
	return constructed_block;
	}

	Block GrammarInfoBuilder::ConstructBlock(ANTLRv4Parser::AtomContext* atom_ctx) {
	Block constructed_block;
	std::string node_name = atom_ctx->getText();
	if (atom_ctx->ruleref() \|\|
	(atom_ctx->terminal() && atom_ctx->terminal()->TOKEN_REF())) {
	constructed_block.element.emplace<kNonTerminal>(NonTerminal{node_name});
	} else if (atom_ctx->terminal()) {
	auto& terminal_node = constructed_block.element.emplace<kTerminal>();
	terminal_node.type = TerminalType::kStringLiteral;
	ChangeStringQuote(node_name);
	terminal_node.content = node_name;
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
	}
	return constructed_block;
	}

	Block GrammarInfoBuilder::ConstructBlock(
	ANTLRv4Parser::ElementContext* element) {
	Block constructed_block;
	std::optional<std::string> suffix;
	if (auto atom = element->atom()) {
	constructed_block = ConstructBlock(atom);
	} else if (auto ebnf = element->ebnf()) {
	constructed_block = ConstructBlock(ebnf->block());
	if (ebnf->blockSuffix()) {
	suffix = ebnf->blockSuffix()->getText();
	}
	} else if (auto labeled_element = element->labeledElement()) {
	if (labeled_element->atom()) {
	constructed_block = ConstructBlock(labeled_element->atom());
	} else if (labeled_element->block()) {
	constructed_block = ConstructBlock(labeled_element->block());
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Impossible case!");
	}
	} else {
	FUZZTEST_INTERNAL_CHECK(false, "Unhandled case!");
	}

	if (element->ebnfSuffix()) {
	FUZZTEST_INTERNAL_CHECK(!suffix.has_value(),
	"It should have only one suffix.");
	suffix = element->ebnfSuffix()->getText();
	}

	if (suffix.has_value()) constructed_block.range = ParseRange(*suffix);
	return constructed_block;
	}

	ProductionRule GrammarInfoBuilder::ConstructProductionRule(
	ANTLRv4Parser::AlternativeContext* ctx) {
	ProductionRule prod_rule;
	for (auto element : ctx->element()) {
	prod_rule.blocks.push_back(ConstructBlock(element));
	}
	return prod_rule;
	}

	GrammarRule GrammarInfoBuilder::ConstructGrammarRule(
	ANTLRv4Parser::ParserRuleSpecContext* pctx) {
	GrammarRule grammar_rule;
	grammar_rule.symbol_name = pctx->RULE_REF()->getText();
	auto labeledAlt_vec = pctx->ruleBlock()->ruleAltList()->labeledAlt();
	for (auto labeled_alt : pctx->ruleBlock()->ruleAltList()->labeledAlt()) {
	auto ctx = labeled_alt->alternative();
	FUZZTEST_INTERNAL_CHECK(ctx != nullptr, "Unhandeled case!");
	grammar_rule.productions.production_rules.push_back(
	ConstructProductionRule(ctx));
	}
	return grammar_rule;
	}

	Grammar GrammarInfoBuilder::BuildGrammarInfo(
	const std::vector<std::string>& input_grammar_specs) {
	FUZZTEST_INTERNAL_CHECK_PRECONDITION(!input_grammar_specs.empty(),
	"No input files!");
	for (auto& input_grammar_spec : input_grammar_specs) {
	antlr4::ANTLRInputStream input(input_grammar_spec);
	antlr4_grammar::ANTLRv4Lexer lexer(&input);
	antlr4::CommonTokenStream tokens(&lexer);

	ANTLRv4Parser parser(&tokens);
	try {
	antlr4::tree::ParseTree* tree = parser.grammarSpec();
	antlr4::tree::ParseTreeWalker::DEFAULT.walk(this, tree);
	} catch (antlr4::ParseCancellationException) {
	FUZZTEST_INTERNAL_CHECK(false, "Cannot parse the grammar files!");
	} catch (...) {
	// The ParseCancellationException might miss some errors. So we need to
	// catch everything here.
	FUZZTEST_INTERNAL_CHECK(false, "Unknown errors!");
	}
	}
	FUZZTEST_INTERNAL_CHECK(!grammar_name_.empty() && !rules_.empty(),
	"Wrong grammar file!");
	return Grammar{std::move(grammar_name_), std::move(rules_)};
	}

	} // namespace fuzztest::internal::grammar