blob: a9abd6919c83f6926d5567c1729eb9b17a1fa759 [file] [log] [blame]
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string matching
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. Compile returns strings that need to be
// matched. Note that the returned strings are lowercased and distinct.
// For applying regexps to a search text, the caller does the string
// matching using the returned strings. When doing the string match,
// note that the caller has to do that in a case-insensitive way or
// on a lowercased version of the search text. Then call FirstMatch
// or AllMatches with a vector of indices of strings that were found
// in the text to get the actual regexp matches.
#include <memory>
#include <string>
#include <vector>
#include "absl/strings/string_view.h"
#include "re2/re2.h"
namespace re2 {
class PrefilterTree;
class FilteredRE2 {
public:
FilteredRE2();
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Not copyable.
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
// Movable.
FilteredRE2(FilteredRE2&& other);
FilteredRE2& operator=(FilteredRE2&& other);
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
RE2::ErrorCode Add(absl::string_view pattern,
const RE2::Options& options,
int* id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased and distinct. When doing
// string matching, it should be performed in a case-insensitive
// way or the search text should be lowercased first. Call after
// all Add calls are done.
void Compile(std::vector<std::string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(absl::string_view text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(absl::string_view text,
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matched_regexps.
bool AllMatches(absl::string_view text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
// Get the individual RE2 objects.
const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
private:
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
std::vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
std::unique_ptr<PrefilterTree> prefilter_tree_;
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_