blob: d0e5ba0ca09db4e1ed7af894aed108b73f13ad47 [file] [log] [blame]
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <fuzzer/FuzzedDataProvider.h>
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <string>
#include <vector>
#include "re2/filtered_re2.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/set.h"
#include "re2/walker-inl.h"
// NOT static, NOT signed.
uint8_t dummy = 0;
// Walks kRegexpConcat and kRegexpAlternate subexpressions
// to determine their maximum length.
class SubexpressionWalker : public re2::Regexp::Walker<int> {
public:
SubexpressionWalker() = default;
~SubexpressionWalker() override = default;
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args) override {
switch (re->op()) {
case re2::kRegexpConcat:
case re2::kRegexpAlternate: {
int max = nchild_args;
for (int i = 0; i < nchild_args; i++)
max = std::max(max, child_args[i]);
return max;
}
default:
break;
}
return -1;
}
// Should never be called: we use Walk(), not WalkExponential().
int ShortVisit(re2::Regexp* re, int parent_arg) override {
return parent_arg;
}
private:
SubexpressionWalker(const SubexpressionWalker&) = delete;
SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
};
// Walks substrings (i.e. kRegexpLiteralString subexpressions)
// to determine their maximum length... in runes, but avoiding
// overheads due to UTF-8 encoding is worthwhile when fuzzing.
class SubstringWalker : public re2::Regexp::Walker<int> {
public:
SubstringWalker() = default;
~SubstringWalker() override = default;
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args) override {
switch (re->op()) {
case re2::kRegexpConcat:
case re2::kRegexpAlternate:
case re2::kRegexpStar:
case re2::kRegexpPlus:
case re2::kRegexpQuest:
case re2::kRegexpRepeat:
case re2::kRegexpCapture: {
int max = -1;
for (int i = 0; i < nchild_args; i++)
max = std::max(max, child_args[i]);
return max;
}
case re2::kRegexpLiteralString:
return re->nrunes();
default:
break;
}
return -1;
}
// Should never be called: we use Walk(), not WalkExponential().
int ShortVisit(re2::Regexp* re, int parent_arg) override {
return parent_arg;
}
private:
SubstringWalker(const SubstringWalker&) = delete;
SubstringWalker& operator=(const SubstringWalker&) = delete;
};
void TestOneInput(absl::string_view pattern, const RE2::Options& options,
RE2::Anchor anchor, absl::string_view text) {
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
// Otherwise, we will waste time on inputs that have long runs of various
// character classes. The fuzzer has shown itself to be easily capable of
// generating such patterns that fall within the other limits, but result
// in timeouts nonetheless. The marginal cost is high - even more so when
// counted repetition is involved - whereas the marginal benefit is zero.
// Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
// three-element character classes when case-insensitive and using UTF-8.
// TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
int char_class = 0;
int backslash_p = 0; // very expensive, so handle specially
for (size_t i = 0; i < pattern.size(); i++) {
if (pattern[i] == '.' ||
pattern[i] == 'k' || pattern[i] == 'K' ||
pattern[i] == 's' || pattern[i] == 'S')
char_class++;
if (pattern[i] != '\\')
continue;
i++;
if (i >= pattern.size())
break;
if (pattern[i] == 'p' || pattern[i] == 'P' ||
pattern[i] == 'd' || pattern[i] == 'D' ||
pattern[i] == 's' || pattern[i] == 'S' ||
pattern[i] == 'w' || pattern[i] == 'W')
char_class++;
if (pattern[i] == 'p' || pattern[i] == 'P')
backslash_p++;
}
if (char_class > 9)
return;
if (backslash_p > 1)
return;
// Iterate just once when fuzzing. Otherwise, we easily get bogged down
// and coverage is unlikely to improve despite significant expense.
RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
// The default is 1000. Even 100 turned out to be too generous
// for fuzzing, empirically speaking, so let's try 10 instead.
re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
RE2 re(pattern, options);
if (!re.ok())
return;
// Don't waste time fuzzing programs with large subexpressions.
// They can cause bug reports due to fuzzer timeouts. And they
// aren't interesting for fuzzing purposes.
if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
return;
// Don't waste time fuzzing programs with large substrings.
// They can cause bug reports due to fuzzer timeouts when they
// are repetitions (e.g. hundreds of NUL bytes) and matching is
// unanchored. And they aren't interesting for fuzzing purposes.
if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
return;
// Don't waste time fuzzing high-size programs.
// They can cause bug reports due to fuzzer timeouts.
int size = re.ProgramSize();
if (size > 9999)
return;
int rsize = re.ReverseProgramSize();
if (rsize > 9999)
return;
// Don't waste time fuzzing high-fanout programs.
// They can cause bug reports due to fuzzer timeouts.
std::vector<int> histogram;
int fanout = re.ProgramFanout(&histogram);
if (fanout > 9)
return;
int rfanout = re.ReverseProgramFanout(&histogram);
if (rfanout > 9)
return;
if (re.NumberOfCapturingGroups() == 0) {
// Avoid early return due to too many arguments.
absl::string_view sp = text;
RE2::FullMatch(sp, re);
RE2::PartialMatch(sp, re);
RE2::Consume(&sp, re);
sp = text; // Reset.
RE2::FindAndConsume(&sp, re);
} else {
// Okay, we have at least one capturing group...
// Try conversion for variously typed arguments.
absl::string_view sp = text;
short s;
RE2::FullMatch(sp, re, &s);
long l;
RE2::PartialMatch(sp, re, &l);
float f;
RE2::Consume(&sp, re, &f);
sp = text; // Reset.
double d;
RE2::FindAndConsume(&sp, re, &d);
}
std::string s = std::string(text);
RE2::Replace(&s, re, "");
s = std::string(text); // Reset.
RE2::GlobalReplace(&s, re, "");
std::string min, max;
re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
// Exercise some other API functionality.
dummy += re.NamedCapturingGroups().size();
dummy += re.CapturingGroupNames().size();
dummy += RE2::QuoteMeta(pattern).size();
dummy += re.Regexp()->ToString().size();
RE2::Set set(options, anchor);
int index = set.Add(pattern, /*error=*/NULL); // -1 on error
if (index != -1 && set.Compile()) {
std::vector<int> matches;
set.Match(text, &matches);
}
re2::FilteredRE2 filter;
index = -1; // not clobbered on error
filter.Add(pattern, options, &index);
if (index != -1) {
std::vector<std::string> atoms;
filter.Compile(&atoms);
// Pretend that all atoms match, which
// triggers the AND-OR tree maximally.
std::vector<int> matched_atoms;
matched_atoms.reserve(atoms.size());
for (size_t i = 0; i < atoms.size(); ++i)
matched_atoms.push_back(static_cast<int>(i));
std::vector<int> matches;
filter.AllMatches(text, matched_atoms, &matches);
}
}
// Entry point for libFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
// An input larger than 4 KiB probably isn't interesting. (This limit
// allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
if (size == 0 || size > 4096)
return 0;
FuzzedDataProvider fdp(data, size);
// The convention here is that fdp.ConsumeBool() returning false sets
// the default value whereas returning true sets the alternate value:
// most options default to false and so can be set directly; encoding
// defaults to UTF-8; case_sensitive defaults to true. We do NOT want
// to log errors. max_mem is 64 MiB because we can afford to use more
// RAM in exchange for (hopefully) faster fuzzing.
RE2::Options options;
options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
: RE2::Options::EncodingUTF8);
options.set_posix_syntax(fdp.ConsumeBool());
options.set_longest_match(fdp.ConsumeBool());
options.set_log_errors(false);
options.set_max_mem(64 << 20);
options.set_literal(fdp.ConsumeBool());
options.set_never_nl(fdp.ConsumeBool());
options.set_dot_nl(fdp.ConsumeBool());
options.set_never_capture(fdp.ConsumeBool());
options.set_case_sensitive(!fdp.ConsumeBool());
options.set_perl_classes(fdp.ConsumeBool());
options.set_word_boundary(fdp.ConsumeBool());
options.set_one_line(fdp.ConsumeBool());
// ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
// kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
RE2::UNANCHORED,
RE2::ANCHOR_START,
RE2::ANCHOR_BOTH,
});
std::string pattern = fdp.ConsumeRandomLengthString(999);
std::string text = fdp.ConsumeRandomLengthString(999);
TestOneInput(pattern, options, anchor, text);
return 0;
}