Russ Cox | 0176cc7 | 2012-02-07 13:13:06 -0500 | [diff] [blame] | 1 | This is a dump from Google's source control system of the change |
| 2 | that removed UCS-2 support from RE2. As the explanation below |
| 3 | says, UCS-2 mode is fundamentally at odds with things like ^ and $, |
| 4 | so it never really worked very well. It did, however, work for patterns |
| 5 | that avoid those operators. It assumed that the |
| 6 | UCS-2 data was in the native host byte order. |
| 7 | |
| 8 | If you are interested in adding UCS-2 mode back, this patch might |
| 9 | be a good starting point. |
| 10 | |
| 11 | |
| 12 | Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 |
| 13 | |
| 14 | Retire UCS-2 mode. |
| 15 | |
| 16 | I added it as an experiment for V8, but it |
| 17 | requires 2-byte lookahead to do completely, |
| 18 | and RE2 has 1-byte lookahead (enough for UTF-8) |
| 19 | as a fairly deep fundamental assumption, |
| 20 | so it did not support ^ or $. |
| 21 | |
| 22 | ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== |
| 23 | re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 |
| 24 | cap_[0] = p; |
| 25 | if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. |
| 26 | return true; |
| 27 | - if (prog_->flags() & Regexp::UCS2) |
| 28 | - p++; |
| 29 | } |
| 30 | return false; |
| 31 | } |
| 32 | ==== re2/compile.cc#17 - re2/compile.cc#18 ==== |
| 33 | re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 |
| 34 | // Input encodings. |
| 35 | enum Encoding { |
| 36 | kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) |
| 37 | - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order |
| 38 | kEncodingLatin1, // Latin1 (0-FF) |
| 39 | }; |
| 40 | |
| 41 | re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 |
| 42 | void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); |
| 43 | void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); |
| 44 | void Add_80_10ffff(); |
| 45 | - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); |
| 46 | - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, |
| 47 | - uint8 lo2, uint8 hi2, bool fold2); |
| 48 | |
| 49 | // New suffix that matches the byte range lo-hi, then goes to next. |
| 50 | Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); |
| 51 | re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 |
| 52 | |
| 53 | // Converts rune range lo-hi into a fragment that recognizes |
| 54 | // the bytes that would make up those runes in the current |
| 55 | - // encoding (Latin 1, UTF-8, or UCS-2). |
| 56 | + // encoding (Latin 1 or UTF-8). |
| 57 | // This lets the machine work byte-by-byte even when |
| 58 | // using multibyte encodings. |
| 59 | |
| 60 | re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 |
| 61 | case kEncodingLatin1: |
| 62 | AddRuneRangeLatin1(lo, hi, foldcase); |
| 63 | break; |
| 64 | - case kEncodingUCS2: |
| 65 | - AddRuneRangeUCS2(lo, hi, foldcase); |
| 66 | - break; |
| 67 | } |
| 68 | } |
| 69 | |
| 70 | re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 |
| 71 | AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); |
| 72 | } |
| 73 | |
| 74 | - // Test whether 16-bit values are big or little endian. |
| 75 | - static bool BigEndian() { |
| 76 | - union { |
| 77 | - char byte[2]; |
| 78 | - int16 endian; |
| 79 | - } u; |
| 80 | - |
| 81 | - u.byte[0] = 1; |
| 82 | - u.byte[1] = 2; |
| 83 | - return u.endian == 0x0102; |
| 84 | - } |
| 85 | - |
| 86 | - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, |
| 87 | - uint8 lo2, uint8 hi2, bool fold2) { |
| 88 | - Inst* ip; |
| 89 | - if (reversed_) { |
| 90 | - ip = RuneByteSuffix(lo1, hi1, fold1, NULL); |
| 91 | - ip = RuneByteSuffix(lo2, hi2, fold2, ip); |
| 92 | - } else { |
| 93 | - ip = RuneByteSuffix(lo2, hi2, fold2, NULL); |
| 94 | - ip = RuneByteSuffix(lo1, hi1, fold1, ip); |
| 95 | - } |
| 96 | - AddSuffix(ip); |
| 97 | - } |
| 98 | - |
| 99 | - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { |
| 100 | - if (lo > hi || lo > 0xFFFF) |
| 101 | - return; |
| 102 | - if (hi > 0xFFFF) |
| 103 | - hi = 0xFFFF; |
| 104 | - |
| 105 | - // We'll assemble a pattern assuming big endian. |
| 106 | - // If the machine isn't, tell Cat to reverse its arguments. |
| 107 | - bool oldreversed = reversed_; |
| 108 | - if (!BigEndian()) { |
| 109 | - reversed_ = !oldreversed; |
| 110 | - } |
| 111 | - |
| 112 | - // Split into bytes. |
| 113 | - int lo1 = lo >> 8; |
| 114 | - int lo2 = lo & 0xFF; |
| 115 | - int hi1 = hi >> 8; |
| 116 | - int hi2 = hi & 0xFF; |
| 117 | - |
| 118 | - if (lo1 == hi1) { |
| 119 | - // Easy case: high bits are same in both. |
| 120 | - // Only do ASCII case folding on the second byte if the top byte is 00. |
| 121 | - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); |
| 122 | - } else { |
| 123 | - // Harder case: different second byte ranges depending on first byte. |
| 124 | - |
| 125 | - // Initial fragment. |
| 126 | - if (lo2 > 0) { |
| 127 | - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); |
| 128 | - lo1++; |
| 129 | - } |
| 130 | - |
| 131 | - // Trailing fragment. |
| 132 | - if (hi2 < 0xFF) { |
| 133 | - AddUCS2Pair(hi1, hi1, false, 0, hi2, false); |
| 134 | - hi1--; |
| 135 | - } |
| 136 | - |
| 137 | - // Inner ranges. |
| 138 | - if (lo1 <= hi1) { |
| 139 | - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); |
| 140 | - } |
| 141 | - } |
| 142 | - |
| 143 | - // Restore reverse setting. |
| 144 | - reversed_ = oldreversed; |
| 145 | - } |
| 146 | - |
| 147 | // Table describing how to make a UTF-8 matching machine |
| 148 | // for the rune range 80-10FFFF (Runeself-Runemax). |
| 149 | // This range happens frequently enough (for example /./ and /[^a-z]/) |
| 150 | re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 |
| 151 | |
| 152 | Frag Compiler::Literal(Rune r, bool foldcase) { |
| 153 | switch (encoding_) { |
| 154 | - default: // UCS-2 or something new |
| 155 | - BeginRange(); |
| 156 | - AddRuneRange(r, r, foldcase); |
| 157 | - return EndRange(); |
| 158 | + default: |
| 159 | + return kNullFrag; |
| 160 | |
| 161 | case kEncodingLatin1: |
| 162 | return ByteRange(r, r, foldcase); |
| 163 | re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 |
| 164 | |
| 165 | if (re->parse_flags() & Regexp::Latin1) |
| 166 | c.encoding_ = kEncodingLatin1; |
| 167 | - else if (re->parse_flags() & Regexp::UCS2) |
| 168 | - c.encoding_ = kEncodingUCS2; |
| 169 | c.reversed_ = reversed; |
| 170 | if (max_mem <= 0) { |
| 171 | c.max_inst_ = 100000; // more than enough |
| 172 | re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 |
| 173 | c.prog_->set_start_unanchored(c.prog_->start()); |
| 174 | } else { |
| 175 | Frag dot; |
| 176 | - if (c.encoding_ == kEncodingUCS2) { |
| 177 | - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); |
| 178 | - } else { |
| 179 | - dot = c.ByteRange(0x00, 0xFF, false); |
| 180 | - } |
| 181 | + dot = c.ByteRange(0x00, 0xFF, false); |
| 182 | Frag dotloop = c.Star(dot, true); |
| 183 | Frag unanchored = c.Cat(dotloop, all); |
| 184 | c.prog_->set_start_unanchored(unanchored.begin); |
| 185 | ==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== |
| 186 | re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 |
| 187 | const char* bp = context.begin(); |
| 188 | int c = -1; |
| 189 | int wasword = 0; |
| 190 | - bool ucs2 = prog_->flags() & Regexp::UCS2; |
| 191 | |
| 192 | if (text.begin() > context.begin()) { |
| 193 | c = text.begin()[-1] & 0xFF; |
| 194 | re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 |
| 195 | // If there's a required first byte for an unanchored search |
| 196 | // and we're not in the middle of any possible matches, |
| 197 | // use memchr to search for the byte quickly. |
| 198 | - if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && |
| 199 | + if (!anchored && first_byte_ >= 0 && runq->size() == 0 && |
| 200 | p < text.end() && (p[0] & 0xFF) != first_byte_) { |
| 201 | p = reinterpret_cast<const char*>(memchr(p, first_byte_, |
| 202 | text.end() - p)); |
| 203 | re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 |
| 204 | flag = Prog::EmptyFlags(context, p); |
| 205 | } |
| 206 | |
| 207 | - // In UCS-2 mode, if we need to start a new thread, |
| 208 | - // make sure to do it on an even boundary. |
| 209 | - if(ucs2 && runq->size() == 0 && |
| 210 | - (p - context.begin()) % 2 && p < text.end()) { |
| 211 | - p++; |
| 212 | - flag = Prog::EmptyFlags(context, p); |
| 213 | - } |
| 214 | - |
| 215 | // Steal match storage (cleared but unused as of yet) |
| 216 | // temporarily to hold match boundaries for new thread. |
| 217 | - // In UCS-2 mode, only start the thread on a 2-byte boundary. |
| 218 | - if(!ucs2 || (p - context.begin()) % 2 == 0) { |
| 219 | - match_[0] = p; |
| 220 | - AddToThreadq(runq, start_, flag, p, match_); |
| 221 | - match_[0] = NULL; |
| 222 | - } |
| 223 | + match_[0] = p; |
| 224 | + AddToThreadq(runq, start_, flag, p, match_); |
| 225 | + match_[0] = NULL; |
| 226 | } |
| 227 | |
| 228 | // If all the threads have died, stop early. |
| 229 | ==== re2/parse.cc#22 - re2/parse.cc#23 ==== |
| 230 | re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 |
| 231 | status_(status), stacktop_(NULL), ncap_(0) { |
| 232 | if (flags_ & Latin1) |
| 233 | rune_max_ = 0xFF; |
| 234 | - else if (flags & UCS2) |
| 235 | - rune_max_ = 0xFFFF; |
| 236 | else |
| 237 | rune_max_ = Runemax; |
| 238 | } |
| 239 | re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 |
| 240 | bool Regexp::ParseState::PushCarat() { |
| 241 | if (flags_ & OneLine) { |
| 242 | return PushSimpleOp(kRegexpBeginText); |
| 243 | - } else { |
| 244 | - if (flags_ & UCS2) { |
| 245 | - status_->set_code(kRegexpUnsupported); |
| 246 | - status_->set_error_arg("multiline ^ in UCS-2 mode"); |
| 247 | - return false; |
| 248 | - } |
| 249 | - return PushSimpleOp(kRegexpBeginLine); |
| 250 | } |
| 251 | + return PushSimpleOp(kRegexpBeginLine); |
| 252 | } |
| 253 | |
| 254 | // Pushes a \b or \B onto the stack. |
| 255 | bool Regexp::ParseState::PushWordBoundary(bool word) { |
| 256 | - if (flags_ & UCS2) { |
| 257 | - status_->set_code(kRegexpUnsupported); |
| 258 | - status_->set_error_arg("\\b or \\B in UCS-2 mode"); |
| 259 | - return false; |
| 260 | - } |
| 261 | if (word) |
| 262 | return PushSimpleOp(kRegexpWordBoundary); |
| 263 | return PushSimpleOp(kRegexpNoWordBoundary); |
| 264 | re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 |
| 265 | bool ret = PushSimpleOp(kRegexpEndText); |
| 266 | flags_ = oflags; |
| 267 | return ret; |
| 268 | - } |
| 269 | - if (flags_ & UCS2) { |
| 270 | - status_->set_code(kRegexpUnsupported); |
| 271 | - status_->set_error_arg("multiline $ in UCS-2 mode"); |
| 272 | - return false; |
| 273 | } |
| 274 | return PushSimpleOp(kRegexpEndLine); |
| 275 | } |
| 276 | ==== re2/re2.cc#34 - re2/re2.cc#35 ==== |
| 277 | re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 |
| 278 | return RE2::ErrorBadUTF8; |
| 279 | case re2::kRegexpBadNamedCapture: |
| 280 | return RE2::ErrorBadNamedCapture; |
| 281 | - case re2::kRegexpUnsupported: |
| 282 | - return RE2::ErrorUnsupported; |
| 283 | } |
| 284 | return RE2::ErrorInternal; |
| 285 | } |
| 286 | re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 |
| 287 | break; |
| 288 | case RE2::Options::EncodingLatin1: |
| 289 | flags |= Regexp::Latin1; |
| 290 | - break; |
| 291 | - case RE2::Options::EncodingUCS2: |
| 292 | - flags |= Regexp::UCS2; |
| 293 | break; |
| 294 | } |
| 295 | |
| 296 | ==== re2/re2.h#36 - re2/re2.h#37 ==== |
| 297 | re2/re2.h#36:246,252 - re2/re2.h#37:246,251 |
| 298 | ErrorBadUTF8, // invalid UTF-8 in regexp |
| 299 | ErrorBadNamedCapture, // bad named capture group |
| 300 | ErrorPatternTooLarge, // pattern too large (compile failed) |
| 301 | - ErrorUnsupported, // unsupported feature (in UCS-2 mode) |
| 302 | }; |
| 303 | |
| 304 | // Predefined common options. |
| 305 | re2/re2.h#36:570,576 - re2/re2.h#37:569,574 |
| 306 | |
| 307 | enum Encoding { |
| 308 | EncodingUTF8 = 1, |
| 309 | - EncodingUCS2, // 16-bit Unicode 0-FFFF only |
| 310 | EncodingLatin1 |
| 311 | }; |
| 312 | |
| 313 | ==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== |
| 314 | re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 |
| 315 | // the regexp that remains after the prefix. The prefix might |
| 316 | // be ASCII case-insensitive. |
| 317 | bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { |
| 318 | - // Don't even bother for UCS-2; it's time to throw that code away. |
| 319 | - if (parse_flags_ & UCS2) |
| 320 | - return false; |
| 321 | - |
| 322 | // No need for a walker: the regexp must be of the form |
| 323 | // 1. some number of ^ anchors |
| 324 | // 2. a literal char or string |
| 325 | ==== re2/regexp.h#20 - re2/regexp.h#21 ==== |
| 326 | re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 |
| 327 | kRegexpBadPerlOp, // bad perl operator |
| 328 | kRegexpBadUTF8, // invalid UTF-8 in regexp |
| 329 | kRegexpBadNamedCapture, // bad named capture |
| 330 | - kRegexpUnsupported, // unsupported operator |
| 331 | }; |
| 332 | |
| 333 | // Error status for certain operations. |
| 334 | re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 |
| 335 | // \Q and \E to disable/enable metacharacters |
| 336 | // (?P<name>expr) for named captures |
| 337 | // \C to match any single byte |
| 338 | - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. |
| 339 | - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group |
| 340 | + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group |
| 341 | // and \P{Han} for its negation. |
| 342 | - NeverNL = 1<<12, // Never match NL, even if the regexp mentions |
| 343 | + NeverNL = 1<<11, // Never match NL, even if the regexp mentions |
| 344 | // it explicitly. |
| 345 | |
| 346 | // As close to Perl as we can get. |
| 347 | ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== |
| 348 | re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 |
| 349 | cap_[0] = p; |
| 350 | if (Visit(prog_->start(), p)) // Match must be leftmost; done. |
| 351 | return true; |
| 352 | - if (prog_->flags() & Regexp::UCS2) |
| 353 | - p++; |
| 354 | } |
| 355 | return false; |
| 356 | } |
| 357 | ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== |
| 358 | re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 |
| 359 | static ParseMode parse_modes[] = { |
| 360 | { single_line, "single-line" }, |
| 361 | { single_line|Regexp::Latin1, "single-line, latin1" }, |
| 362 | - { single_line|Regexp::UCS2, "single-line, ucs2" }, |
| 363 | { multi_line, "multiline" }, |
| 364 | { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, |
| 365 | { multi_line|Regexp::Latin1, "multiline, latin1" }, |
| 366 | - { multi_line|Regexp::UCS2, "multiline, ucs2" }, |
| 367 | }; |
| 368 | |
| 369 | static string FormatMode(Regexp::ParseFlags flags) { |
| 370 | re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 |
| 371 | RegexpStatus status; |
| 372 | regexp_ = Regexp::Parse(regexp_str, flags, &status); |
| 373 | if (regexp_ == NULL) { |
| 374 | - if (status.code() != kRegexpUnsupported) { |
| 375 | - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) |
| 376 | - << " mode: " << FormatMode(flags); |
| 377 | - error_ = true; |
| 378 | - } |
| 379 | + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) |
| 380 | + << " mode: " << FormatMode(flags); |
| 381 | + error_ = true; |
| 382 | return; |
| 383 | } |
| 384 | prog_ = regexp_->CompileToProg(0); |
| 385 | re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 |
| 386 | RE2::Options options; |
| 387 | if (flags & Regexp::Latin1) |
| 388 | options.set_encoding(RE2::Options::EncodingLatin1); |
| 389 | - else if (flags & Regexp::UCS2) |
| 390 | - options.set_encoding(RE2::Options::EncodingUCS2); |
| 391 | if (kind_ == Prog::kLongestMatch) |
| 392 | options.set_longest_match(true); |
| 393 | re2_ = new RE2(re, options); |
| 394 | re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 |
| 395 | delete re2_; |
| 396 | } |
| 397 | |
| 398 | - // Converts UTF-8 string in text into UCS-2 string in new_text. |
| 399 | - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { |
| 400 | - const char* p = text.begin(); |
| 401 | - const char* ep = text.end(); |
| 402 | - uint16* q = new uint16[ep - p]; |
| 403 | - uint16* q0 = q; |
| 404 | - |
| 405 | - int n; |
| 406 | - Rune r; |
| 407 | - for (; p < ep; p += n) { |
| 408 | - if (!fullrune(p, ep - p)) { |
| 409 | - delete[] q0; |
| 410 | - return false; |
| 411 | - } |
| 412 | - n = chartorune(&r, p); |
| 413 | - if (r > 0xFFFF) { |
| 414 | - delete[] q0; |
| 415 | - return false; |
| 416 | - } |
| 417 | - *q++ = r; |
| 418 | - } |
| 419 | - *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0)); |
| 420 | - return true; |
| 421 | - } |
| 422 | - |
| 423 | - // Rewrites *sp from being a pointer into text8 (UTF-8) |
| 424 | - // to being a pointer into text16 (equivalent text but in UCS-2). |
| 425 | - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, |
| 426 | - StringPiece *sp) { |
| 427 | - if (sp->begin() == NULL && text8.begin() != NULL) |
| 428 | - return; |
| 429 | - |
| 430 | - int nrune = 0; |
| 431 | - int n; |
| 432 | - Rune r; |
| 433 | - const char* p = text8.begin(); |
| 434 | - const char* ep = text8.end(); |
| 435 | - const char* spbegin = NULL; |
| 436 | - const char* spend = NULL; |
| 437 | - for (;;) { |
| 438 | - if (p == sp->begin()) |
| 439 | - spbegin = text16.begin() + sizeof(uint16)*nrune; |
| 440 | - if (p == sp->end()) |
| 441 | - spend = text16.begin() + sizeof(uint16)*nrune; |
| 442 | - if (p >= ep) |
| 443 | - break; |
| 444 | - n = chartorune(&r, p); |
| 445 | - p += n; |
| 446 | - nrune++; |
| 447 | - } |
| 448 | - if (spbegin == NULL || spend == NULL) { |
| 449 | - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " |
| 450 | - << CEscape(text8) << " " |
| 451 | - << (int)(sp->begin() - text8.begin()) << " " |
| 452 | - << (int)(sp->end() - text8.begin()); |
| 453 | - } |
| 454 | - *sp = StringPiece(spbegin, spend - spbegin); |
| 455 | - } |
| 456 | - |
| 457 | - // Rewrites *sp from being a pointer into text16 (UCS-2) |
| 458 | - // to being a pointer into text8 (equivalent text but in UTF-8). |
| 459 | - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, |
| 460 | - StringPiece* sp) { |
| 461 | - if (sp->begin() == NULL) |
| 462 | - return; |
| 463 | - |
| 464 | - int nrune = 0; |
| 465 | - int n; |
| 466 | - Rune r; |
| 467 | - const char* p = text8.begin(); |
| 468 | - const char* ep = text8.end(); |
| 469 | - const char* spbegin = NULL; |
| 470 | - const char* spend = NULL; |
| 471 | - for (;;) { |
| 472 | - if (nrune == (sp->begin() - text16.begin())/2) |
| 473 | - spbegin = p; |
| 474 | - if (nrune == (sp->end() - text16.begin())/2) |
| 475 | - spend = p; |
| 476 | - if (p >= ep) |
| 477 | - break; |
| 478 | - n = chartorune(&r, p); |
| 479 | - p += n; |
| 480 | - nrune++; |
| 481 | - } |
| 482 | - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { |
| 483 | - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " |
| 484 | - << CEscape(text16) << " " |
| 485 | - << (int)(sp->begin() - text16.begin()) << " " |
| 486 | - << (int)(sp->end() - text16.begin()); |
| 487 | - } |
| 488 | - *sp = StringPiece(spbegin, spend - spbegin); |
| 489 | - } |
| 490 | - |
| 491 | // Runs a single search using the named engine type. |
| 492 | // This interface hides all the irregularities of the various |
| 493 | // engine interfaces from the rest of this file. |
| 494 | re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 |
| 495 | |
| 496 | StringPiece text = orig_text; |
| 497 | StringPiece context = orig_context; |
| 498 | - bool ucs2 = false; |
| 499 | |
| 500 | - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { |
| 501 | - if (!ConvertUTF8ToUCS2(orig_context, &context)) { |
| 502 | - result->skipped = true; |
| 503 | - return; |
| 504 | - } |
| 505 | - |
| 506 | - // Rewrite context to refer to new text. |
| 507 | - AdjustUTF8ToUCS2(orig_context, context, &text); |
| 508 | - ucs2 = true; |
| 509 | - } |
| 510 | - |
| 511 | switch (type) { |
| 512 | default: |
| 513 | LOG(FATAL) << "Bad RunSearch type: " << (int)type; |
| 514 | re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 |
| 515 | } |
| 516 | } |
| 517 | |
| 518 | - // If we did UCS-2 matching, rewrite the matches to refer |
| 519 | - // to the original UTF-8 text. |
| 520 | - if (ucs2) { |
| 521 | - if (result->matched) { |
| 522 | - if (result->have_submatch0) { |
| 523 | - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); |
| 524 | - } else if (result->have_submatch) { |
| 525 | - for (int i = 0; i < nsubmatch; i++) { |
| 526 | - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); |
| 527 | - } |
| 528 | - } |
| 529 | - } |
| 530 | - delete[] context.begin(); |
| 531 | - } |
| 532 | - |
| 533 | if (!result->matched) |
| 534 | memset(result->submatch, 0, sizeof result->submatch); |
| 535 | } |
| 536 | re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 |
| 537 | return true; |
| 538 | } |
| 539 | |
| 540 | - // Check whether text uses only Unicode points <= 0xFFFF |
| 541 | - // (in the BMP). |
| 542 | - static bool IsBMP(const StringPiece& text) { |
| 543 | - const char* p = text.begin(); |
| 544 | - const char* ep = text.end(); |
| 545 | - while (p < ep) { |
| 546 | - if (!fullrune(p, ep - p)) |
| 547 | - return false; |
| 548 | - Rune r; |
| 549 | - p += chartorune(&r, p); |
| 550 | - if (r > 0xFFFF) |
| 551 | - return false; |
| 552 | - } |
| 553 | - return true; |
| 554 | - } |
| 555 | - |
| 556 | // Runs a single test. |
| 557 | bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, |
| 558 | Prog::Anchor anchor) { |
| 559 | re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 |
| 560 | Result correct; |
| 561 | RunSearch(kEngineBacktrack, text, context, anchor, &correct); |
| 562 | if (correct.skipped) { |
| 563 | - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode |
| 564 | + if (regexp_ == NULL) |
| 565 | return true; |
| 566 | LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) |
| 567 | << " " << FormatMode(flags_); |