Fix more ancient bugs around Latin-1 handling.

It turned out that case folding assumed UTF-8 mode, so we would
fold, say, 0xD1 to 0xF1 even in Latin-1 mode.

Fixes #477.

Change-Id: I73aa5c8e33ee0c6041c54e3a7268635915960f64
Reviewed-on: https://code-review.googlesource.com/c/re2/+/62714
Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>

commit: f9550c3f7207f946a45bbccd1814b12b136aae72 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Mon Feb 19 16:23:29 2024 +0000
committer: Paul Wankadia <junyer@google.com> Mon Feb 19 16:37:14 2024 +0000
tree: 25b11b258313b375466393629a2ca5aecd46f20e
parent: 0ff0fabc78d27337d11eb90744c13e5f9a2a41bb [diff]
diff --git a/re2/parse.cc b/re2/parse.cc
index a027917..2558b2a 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc

@@ -338,6 +338,20 @@
 }
 
 // Add lo-hi to the class, along with their fold-equivalent characters.
+static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
+  while (lo <= hi) {
+    cc->AddRange(lo, lo);
+    if ('A' <= lo && lo <= 'Z') {
+      cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
+    }
+    if ('a' <= lo && lo <= 'z') {
+      cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
+    }
+    lo++;
+  }
+}
+
+// Add lo-hi to the class, along with their fold-equivalent characters.
 // If lo-hi is already in the class, assume that the fold-equivalent
 // chars are there too, so there's no work to do.
 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
@@ -394,17 +408,26 @@
 // Pushes the literal rune r onto the stack.
 bool Regexp::ParseState::PushLiteral(Rune r) {
   // Do case folding if needed.
-  if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
-    Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
-    re->ccb_ = new CharClassBuilder;
-    Rune r1 = r;
-    do {
-      if (!(flags_ & NeverNL) || r != '\n') {
-        re->ccb_->AddRange(r, r);
-      }
-      r = CycleFoldRune(r);
-    } while (r != r1);
-    return PushRegexp(re);
+  if (flags_ & FoldCase) {
+    if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
+                            ('a' <= r && r <= 'z'))) {
+      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+      re->ccb_ = new CharClassBuilder;
+      AddFoldedRangeLatin1(re->ccb_, r, r);
+      return PushRegexp(re);
+    }
+    if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
+      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+      re->ccb_ = new CharClassBuilder;
+      Rune r1 = r;
+      do {
+        if (!(flags_ & NeverNL) || r != '\n') {
+          re->ccb_->AddRange(r, r);
+        }
+        r = CycleFoldRune(r);
+      } while (r != r1);
+      return PushRegexp(re);
+    }
   }
 
   // Exclude newline if applicable.
@@ -1176,7 +1199,7 @@
         if (re->op() == kRegexpCharClass) {
           CharClass* cc = re->cc();
           for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
-            ccb.AddRange(it->lo, it->hi);
+            ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
         } else if (re->op() == kRegexpLiteral) {
           if (re->parse_flags() & Regexp::FoldCase) {
             // AddFoldedRange() can terminate prematurely if the character class
@@ -1195,7 +1218,7 @@
         }
         re->Decref();
       }
-      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
+      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
       splices->emplace_back(re, sub + start, i - start);
     }
 
@@ -1623,10 +1646,15 @@
   }
 
   // If folding case, add fold-equivalent characters too.
-  if (parse_flags & Regexp::FoldCase)
-    AddFoldedRange(this, lo, hi, 0);
-  else
+  if (parse_flags & Regexp::FoldCase) {
+    if (parse_flags & Regexp::Latin1) {
+      AddFoldedRangeLatin1(this, lo, hi);
+    } else {
+      AddFoldedRange(this, lo, hi, 0);
+    }
+  } else {
     AddRange(lo, hi);
+  }
 }
 
 // Look for a group with the given name.

diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 5cddd23..9e3c94a 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc

@@ -96,17 +96,25 @@
       break;
     case kRegexpLiteral: {
       Rune r = re->rune();
-      char buf[UTFmax+1];
-      buf[runetochar(buf, &r)] = 0;
-      s->append(buf);
+      if (re->parse_flags() & Regexp::Latin1) {
+        s->push_back(r);
+      } else {
+        char buf[UTFmax+1];
+        buf[runetochar(buf, &r)] = 0;
+        s->append(buf);
+      }
       break;
     }
     case kRegexpLiteralString:
       for (int i = 0; i < re->nrunes(); i++) {
         Rune r = re->runes()[i];
-        char buf[UTFmax+1];
-        buf[runetochar(buf, &r)] = 0;
-        s->append(buf);
+        if (re->parse_flags() & Regexp::Latin1) {
+          s->push_back(r);
+        } else {
+          char buf[UTFmax+1];
+          buf[runetochar(buf, &r)] = 0;
+          s->append(buf);
+        }
       }
       break;
     case kRegexpConcat:

diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 7684b62..95294d5 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc

@@ -225,6 +225,29 @@
   // Bug in Regexp::ToString() that emitted [^], which
   // would (obviously) fail to parse when fed back in.
   { "[\\s\\S]", "cc{0-0x10ffff}" },
+
+  // As per https://github.com/google/re2/issues/477,
+  // there were long-standing bugs involving Latin-1.
+  // Here, we exercise it WITHOUT case folding...
+  { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
+  { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
+  { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
+  { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
+  { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+  { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+  { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+  { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+  // Here, we exercise it WITH case folding...
+  // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
+  // should fold to 0xF1 and 0xF2, respectively.
+  { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
 };
 
 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
@@ -492,7 +515,7 @@
       //     << " t=" << t << " regexp=" << tests[i].regexp;
 
       // Test that if we parse the new regexp we get the same structure.
-      Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+      Regexp* nre = Regexp::Parse(t, f, &status);
       ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
       std::string ss = nre->Dump();
       std::string tt = nre->ToString();
commit	f9550c3f7207f946a45bbccd1814b12b136aae72	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Mon Feb 19 16:23:29 2024 +0000
committer	Paul Wankadia <junyer@google.com>	Mon Feb 19 16:37:14 2024 +0000
tree	25b11b258313b375466393629a2ca5aecd46f20e
parent	0ff0fabc78d27337d11eb90744c13e5f9a2a41bb [diff]