Fix more ancient bugs around Latin-1 handling.
It turned out that case folding assumed UTF-8 mode, so
we would fold, say, 0xD1 to 0xF1 even in Latin-1 mode.
Fixes #477.
Change-Id: I73aa5c8e33ee0c6041c54e3a7268635915960f64
Reviewed-on: https://code-review.googlesource.com/c/re2/+/62714
Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/parse.cc b/re2/parse.cc
index a027917..2558b2a 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -338,6 +338,20 @@
}
// Add lo-hi to the class, along with their fold-equivalent characters.
+static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
+ while (lo <= hi) {
+ cc->AddRange(lo, lo);
+ if ('A' <= lo && lo <= 'Z') {
+ cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
+ }
+ if ('a' <= lo && lo <= 'z') {
+ cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
+ }
+ lo++;
+ }
+}
+
+// Add lo-hi to the class, along with their fold-equivalent characters.
// If lo-hi is already in the class, assume that the fold-equivalent
// chars are there too, so there's no work to do.
static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
@@ -394,17 +408,26 @@
// Pushes the literal rune r onto the stack.
bool Regexp::ParseState::PushLiteral(Rune r) {
// Do case folding if needed.
- if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
- Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- Rune r1 = r;
- do {
- if (!(flags_ & NeverNL) || r != '\n') {
- re->ccb_->AddRange(r, r);
- }
- r = CycleFoldRune(r);
- } while (r != r1);
- return PushRegexp(re);
+ if (flags_ & FoldCase) {
+ if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
+ ('a' <= r && r <= 'z'))) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ AddFoldedRangeLatin1(re->ccb_, r, r);
+ return PushRegexp(re);
+ }
+ if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ Rune r1 = r;
+ do {
+ if (!(flags_ & NeverNL) || r != '\n') {
+ re->ccb_->AddRange(r, r);
+ }
+ r = CycleFoldRune(r);
+ } while (r != r1);
+ return PushRegexp(re);
+ }
}
// Exclude newline if applicable.
@@ -1176,7 +1199,7 @@
if (re->op() == kRegexpCharClass) {
CharClass* cc = re->cc();
for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
- ccb.AddRange(it->lo, it->hi);
+ ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
} else if (re->op() == kRegexpLiteral) {
if (re->parse_flags() & Regexp::FoldCase) {
// AddFoldedRange() can terminate prematurely if the character class
@@ -1195,7 +1218,7 @@
}
re->Decref();
}
- Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
+ Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
splices->emplace_back(re, sub + start, i - start);
}
@@ -1623,10 +1646,15 @@
}
// If folding case, add fold-equivalent characters too.
- if (parse_flags & Regexp::FoldCase)
- AddFoldedRange(this, lo, hi, 0);
- else
+ if (parse_flags & Regexp::FoldCase) {
+ if (parse_flags & Regexp::Latin1) {
+ AddFoldedRangeLatin1(this, lo, hi);
+ } else {
+ AddFoldedRange(this, lo, hi, 0);
+ }
+ } else {
AddRange(lo, hi);
+ }
}
// Look for a group with the given name.
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 5cddd23..9e3c94a 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc
@@ -96,17 +96,25 @@
break;
case kRegexpLiteral: {
Rune r = re->rune();
- char buf[UTFmax+1];
- buf[runetochar(buf, &r)] = 0;
- s->append(buf);
+ if (re->parse_flags() & Regexp::Latin1) {
+ s->push_back(r);
+ } else {
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
- char buf[UTFmax+1];
- buf[runetochar(buf, &r)] = 0;
- s->append(buf);
+ if (re->parse_flags() & Regexp::Latin1) {
+ s->push_back(r);
+ } else {
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
}
break;
case kRegexpConcat:
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 7684b62..95294d5 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -225,6 +225,29 @@
// Bug in Regexp::ToString() that emitted [^], which
// would (obviously) fail to parse when fed back in.
{ "[\\s\\S]", "cc{0-0x10ffff}" },
+
+ // As per https://github.com/google/re2/issues/477,
+ // there were long-standing bugs involving Latin-1.
+ // Here, we exercise it WITHOUT case folding...
+ { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
+ { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
+ { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
+ { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
+ { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+ { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+ { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+ { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+ // Here, we exercise it WITH case folding...
+ // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
+ // should fold to 0xF1 and 0xF2, respectively.
+ { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
};
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
@@ -492,7 +515,7 @@
// << " t=" << t << " regexp=" << tests[i].regexp;
// Test that if we parse the new regexp we get the same structure.
- Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+ Regexp* nre = Regexp::Parse(t, f, &status);
ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
std::string ss = nre->Dump();
std::string tt = nre->ToString();