[K/N][stdlib] Support \V, \v, \H, \h, \R in regex engine. Issue #KT-50742 Fixed

commit: f7468cf9bcf03035ce7427e74b37597a595c89dc [log] [tgz]
author: Ilya Matveev <ilya.matveev@jetbrains.com> Thu Feb 17 16:26:45 2022 +0700
committer: Ilya Matveev <ilya.matveev@jetbrains.com> Mon Feb 21 13:20:23 2022 +0700
tree: 8e98cd6e605be97ead1cd6e0c725aa33df086bc8
parent: 798a15c4509f3a25c9a32712d9f1a1a1761b9375 [diff]
diff --git a/libraries/stdlib/native-wasm/src/kotlin/text/regex/AbstractCharClass.kt b/libraries/stdlib/native-wasm/src/kotlin/text/regex/AbstractCharClass.kt
index 090a5e4..f6dbb42 100644
--- a/libraries/stdlib/native-wasm/src/kotlin/text/regex/AbstractCharClass.kt
+++ b/libraries/stdlib/native-wasm/src/kotlin/text/regex/AbstractCharClass.kt

@@ -300,6 +300,46 @@
         override fun computeValue(): AbstractCharClass = CharClass().add('0', '9').add('a', 'f').add('A', 'F')
     }
 
+    // \v - a vertical whitespace character, [\n\x0B\f\r\x85\u2028\u2029] (as in the Java 8+ `Pattern` doc).
+    internal class CachedVerticalWhitespace : CachedCharClass() {
+        init { initValues() }
+        override fun computeValue(): AbstractCharClass {
+            val members = listOf('\n', '\u000B', '\u000C' /* aka \f */, '\r', '\u0085', '\u2028', '\u2029')
+            return CharClass().addAll(members)
+        }
+    }
+
+    // \V - a non-vertical whitespace character, [^\v] (as in the Java 8+ `Pattern` doc).
+    internal class CachedNonVerticalWhitespace : CachedCharClass() {
+        init { initValues() }
+        // TODO: Does it match a pair of surrogates? Do we really need mayContainSupplCodepoints?
+        override fun computeValue(): AbstractCharClass {
+            val negatedVertical = CachedVerticalWhitespace().getValue(negative = true)
+            return negatedVertical.apply { mayContainSupplCodepoints = true }
+        }
+    }
+
+    // \h - a horizontal whitespace character (as in the Java 8+ `Pattern` doc):
+    // [ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]
+    internal class CachedHorizontalWhitespace : CachedCharClass() {
+        init { initValues() }
+        override fun computeValue(): AbstractCharClass {
+            val singleChars = listOf(' ', '\t', '\u00A0', '\u1680', '\u180e', '\u202f', '\u205f', '\u3000')
+            // The stand-alone chars go in first, then the contiguous \u2000-\u200a range.
+            return CharClass().addAll(singleChars).add('\u2000', '\u200a')
+        }
+    }
+
+    // \H - a non-horizontal whitespace character (as in the Java 8+ `Pattern` doc):
+    // [^\h], i.e. the negated value of CachedHorizontalWhitespace.
+    internal class CachedNonHorizontalWhitespace : CachedCharClass() {
+        init { initValues() }
+        override fun computeValue(): AbstractCharClass {
+            val negatedHorizontal = CachedHorizontalWhitespace().getValue(negative = true)
+            return negatedHorizontal.apply { mayContainSupplCodepoints = true }
+        }
+    }
+
     internal class CachedRange(var start: Int, var end: Int) : CachedCharClass() {
         init {
             initValues()
@@ -392,6 +432,10 @@
             NON_SPACE("S", ::CachedNonSpace),
             DIGIT_SHORT("d", ::CachedDigit),
             NON_DIGIT("D", ::CachedNonDigit),
+            VERTICAL_WHITESPACE("v", ::CachedVerticalWhitespace),
+            NON_VERTICAL_WHITESPACE("V", ::CachedNonVerticalWhitespace),
+            HORIZONTAL_WHITESPACE("h", ::CachedHorizontalWhitespace),
+            NON_HORIZONTAL_WHITESPACE("H", ::CachedNonHorizontalWhitespace),
             BASIC_LATIN("BasicLatin", { CachedRange(0x0000, 0x007F) }),
             LATIN1_SUPPLEMENT("Latin-1Supplement", { CachedRange(0x0080, 0x00FF) }),
             LATIN_EXTENDED_A("LatinExtended-A", { CachedRange(0x0100, 0x017F) }),

diff --git a/libraries/stdlib/native-wasm/src/kotlin/text/regex/CharClass.kt b/libraries/stdlib/native-wasm/src/kotlin/text/regex/CharClass.kt
index 9333d94..5a65d85 100644
--- a/libraries/stdlib/native-wasm/src/kotlin/text/regex/CharClass.kt
+++ b/libraries/stdlib/native-wasm/src/kotlin/text/regex/CharClass.kt

@@ -244,6 +244,16 @@
 
     fun add(start: Char, end: Char): CharClass = add(start.toInt(), end.toInt())
 
+    // Adds each char of [chars] through add() and returns this instance, so calls can be chained.
+    fun addAll(chars: Iterable<Char>): CharClass = apply {
+        for (ch in chars) add(ch)
+    }
+
+    // Int overload: adds each value of [chars] through add() and returns this instance for chaining.
+    fun addAll(chars: Iterable<Int>): CharClass = apply {
+        for (code in chars) add(code)
+    }
+
     // OR operation
     fun union(another: AbstractCharClass) {
         if (!mayContainSupplCodepoints && another.mayContainSupplCodepoints) {

diff --git a/libraries/stdlib/native-wasm/src/kotlin/text/regex/Lexer.kt b/libraries/stdlib/native-wasm/src/kotlin/text/regex/Lexer.kt
index eb2465a..a0055e1 100644
--- a/libraries/stdlib/native-wasm/src/kotlin/text/regex/Lexer.kt
+++ b/libraries/stdlib/native-wasm/src/kotlin/text/regex/Lexer.kt

@@ -435,7 +435,7 @@
             }
 
             // Word/whitespace/digit.
-            'w', 's', 'd', 'W', 'S', 'D' -> {
+            'w', 's', 'd', 'W', 'S', 'D', 'v', 'V', 'h', 'H' -> {
                 lookAheadSpecialToken = AbstractCharClass.getPredefinedClass(
                         pattern.concatToString(prevNonWhitespaceIndex, prevNonWhitespaceIndex + 1),
                         false
@@ -477,6 +477,7 @@
             'G' -> lookAhead = CHAR_PREVIOUS_MATCH
             'Z' -> lookAhead = CHAR_END_OF_LINE
             'z' -> lookAhead = CHAR_END_OF_INPUT
+            'R' -> lookAhead = CHAR_LINEBREAK
 
             // \cx - A control character corresponding to x.
             'c' -> {
@@ -488,7 +489,7 @@
                 }
             }
 
-            'C', 'E', 'F', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'R', 'T', 'U', 'V', 'X', 'Y', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'q', 'y' ->
+            'C', 'E', 'F', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'T', 'U', 'X', 'Y', 'g', 'i', 'j', 'k', 'l', 'm', 'o', 'q', 'y' ->
                 throw PatternSyntaxException("Illegal escape sequence", patternString, curTokenIndex)
         }
         return false
@@ -704,6 +705,7 @@
         val CHAR_PREVIOUS_MATCH       = 0x80000000.toInt() or 'G'.toInt()
         val CHAR_END_OF_INPUT         = 0x80000000.toInt() or 'z'.toInt()
         val CHAR_END_OF_LINE          = 0x80000000.toInt() or 'Z'.toInt()
+        val CHAR_LINEBREAK            = 0x80000000.toInt() or 'R'.toInt()
 
         // Quantifier modes.
         val QMOD_GREEDY     = 0xe0000000.toInt()

diff --git a/libraries/stdlib/native-wasm/src/kotlin/text/regex/Pattern.kt b/libraries/stdlib/native-wasm/src/kotlin/text/regex/Pattern.kt
index 9eaad27..01bdf00 100644
--- a/libraries/stdlib/native-wasm/src/kotlin/text/regex/Pattern.kt
+++ b/libraries/stdlib/native-wasm/src/kotlin/text/regex/Pattern.kt

@@ -533,11 +533,27 @@
                     term = EOLSet(consumersCount++, AbstractLineTerminator.getInstance(flags))
                 }
 
-                Lexer.CHAR_START_OF_INPUT -> {  // Start if an input: \A
+                Lexer.CHAR_START_OF_INPUT -> {  // Start of an input: \A
                     lexemes.next()
                     term = SOLSet(AbstractLineTerminator.getInstance(flags))
                 }
 
+                Lexer.CHAR_LINEBREAK -> {
+                    // Any unicode linebreak sequence:
+                    // \u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]
+                    lexemes.next()
+                    val fSet = NonCapFSet(consumersCount++)
+                    val lineBreakSequence = SequenceSet("\u000D\u000A").apply {
+                        next = fSet
+                    }
+                    val lineBreakChars = RangeSet(
+                        CharClass().addAll(listOf('\u000A', '\u000B', '\u000C', '\u000D', '\u0085', '\u2028', '\u2029'))
+                    ).apply {
+                        next = fSet
+                    }
+                    term = NonCapturingJointSet(listOf(lineBreakSequence, lineBreakChars), fSet)
+                }
+
                 Lexer.CHAR_PREVIOUS_MATCH -> {  // A previous match: \G
                     lexemes.next()
                     term = PreviousMatchSet()

diff --git a/libraries/stdlib/native-wasm/test/harmony_regex/PatternTest.kt b/libraries/stdlib/native-wasm/test/harmony_regex/PatternTest.kt
index ef8cccb..d7aea8c 100644
--- a/libraries/stdlib/native-wasm/test/harmony_regex/PatternTest.kt
+++ b/libraries/stdlib/native-wasm/test/harmony_regex/PatternTest.kt

@@ -25,6 +25,37 @@
     fun assertTrue(msg: String, value: Boolean) = assertTrue(value, msg)
     fun assertFalse(msg: String, value: Boolean) = assertFalse(value, msg)
 
+    // Renders the receiver for assertion messages: letters and digits are kept as is,
+    // every other char is printed as `\n`, `\r`, `\t` or a four-digit `\uXXXX` escape.
+    private fun String.asEscapeSeq() = buildString {
+        this@asEscapeSeq.forEach {
+            when {
+                it.isLetterOrDigit() -> append(it)
+                it == '\n' -> append("\\n")
+                it == '\r' -> append("\\r")
+                it == '\t' -> append("\\t")
+                // Zero-pad to four hex digits so that e.g. \u000B is not printed as the ambiguous `\ub`.
+                else -> append("\\u" + it.code.toString(16).padStart(4, '0'))
+            }
+        }
+    }
+
+    // Fails unless `regex.matches(string)` is true; the message shows [string] in escaped form.
+    private fun assertMatch(regex: Regex, string: String) =
+        assertTrue(regex.matches(string), "Regex `$regex` expected to match string `${string.asEscapeSeq()}`")
+
+    // Fails if `regex.matches(string)` is true; the message shows [string] in escaped form.
+    private fun assertNoMatch(regex: Regex, string: String) =
+        assertFalse(regex.matches(string), "Regex `$regex` expected to not match string `${string.asEscapeSeq()}`")
+
+    // Checks that `regex.find(string)` reports exactly [expectedRange];
+    // when nothing is found the actual value is null, so the comparison fails.
+    private fun assertFind(regex: Regex, string: String, expectedRange: IntRange) {
+        val actualRange = regex.find(string)?.range
+        val failureMessage = "Wrong `find` result for regex `$regex` in string `${string.asEscapeSeq()}`"
+        assertEquals(expectedRange, actualRange, failureMessage)
+    }
+
     internal var testPatterns = arrayOf("(a|b)*abb", "(1*2*3*4*)*567", "(a|b|c|d)*aab", "(1|2|3|4|5|6|7|8|9|0)(1|2|3|4|5|6|7|8|9|0)*", "(abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ)*", "(a|b)*(a|b)*A(a|b)*lice.*", "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)(a|b|c|d|e|f|g|h|" + "i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)*(1|2|3|4|5|6|7|8|9|0)*|while|for|struct|if|do", "x(?c)y", "x(?cc)y", "x(?:c)y")
 
     @Test fun testCommentsInPattern() {
@@ -1260,4 +1291,323 @@
 
         assertNull(result)
     }
+
+    @Test fun testVerticalWhitespaceChar() {
+        // Covers the predefined classes described in the Java 8+ `Pattern` doc:
+        // \v - A vertical whitespace character: [\n\x0B\f\r\x85\u2028\u2029]
+        // \V - A non-vertical whitespace character: [^\v]
+
+        val positiveRegex = Regex("\\v")
+        val negativeRegex = Regex("\\V")
+        val verticalWhitespaces = listOf("\n", "\u000B", "\u000C" /* aka \f */, "\r", "\u0085", "\u2028", "\u2029")
+        val nonVerticalWhitespaces = listOf("1", "K", " ", "${Char.MIN_HIGH_SURROGATE}${Char.MIN_LOW_SURROGATE}")
+
+        // Smoke tests
+        verticalWhitespaces.forEach {
+            assertMatch(positiveRegex, it)
+            assertNoMatch(negativeRegex, it)
+
+            assertFind(positiveRegex, "prefix$it", 6..6)
+            assertFind(positiveRegex, "prefix${it}suffix", 6..6)
+            assertFind(positiveRegex, "${it}suffix", 0..0)
+        }
+
+        nonVerticalWhitespaces.forEach {
+            assertNoMatch(positiveRegex, it)
+            assertMatch(negativeRegex, it)
+
+            assertFind(negativeRegex, "\n\n$it", 2 until (2 + it.length))
+            assertFind(negativeRegex, "\n\n${it}\n\n", 2 until (2 + it.length))
+            assertFind(negativeRegex, "${it}\n\n", 0 until it.length)
+        }
+
+        // Test that \v and \V can be a part of a more complex regex.
+        val neighbors = listOf(
+            // regex expression to its match.
+            "x" to "x",
+            " " to " ",
+            "\\n" to "\n",
+            "\\\\" to "\\",
+            "[abc]" to "a",
+            "." to "x",
+            "\\d" to "5",
+            "\\D" to "x",
+            "\\s" to " ",
+            "\\S" to "x",
+            "\\w" to "x",
+            "\\W" to "|",
+            "\\p{Alnum}" to "x",
+            "\\p{Space}" to " ",
+            "\\p{Blank}" to " ",
+            "\\p{Sc}" to "$"
+        ).flatMap { (expression, match) ->
+            listOf(
+                expression to match,
+                "$expression+" to match.repeat(2),
+                "$expression{2,4}" to match.repeat(3)
+            )
+        }
+
+        neighbors.forEach { (neighbor, match) ->
+            // \v
+            assertMatch(Regex("$neighbor\\v"), "$match\n")
+            assertMatch(Regex("$neighbor\\v$neighbor"), "$match\n$match")
+            assertMatch(Regex("\\v$neighbor"), "\n$match")
+
+            assertMatch(Regex("$neighbor\\v+"), "$match\n\n")
+            assertMatch(Regex("$neighbor\\v+$neighbor"), "$match\n\n$match")
+            assertMatch(Regex("\\v+$neighbor"), "\n\n$match")
+
+            assertMatch(Regex("$neighbor\\v{2,4}"), "$match\n\n\n")
+            assertMatch(Regex("$neighbor\\v{2,4}$neighbor"), "$match\n\n\n$match")
+            assertMatch(Regex("\\v{2,4}$neighbor"), "\n\n\n$match")
+
+            assertMatch(Regex("$neighbor\\v*"), match)
+            assertMatch(Regex("$neighbor\\v*$neighbor"), "$match$match")
+            assertMatch(Regex("\\v*$neighbor"), match)
+
+            // \V
+            assertMatch(Regex("$neighbor\\V"), "$match ")
+            assertMatch(Regex("$neighbor\\V$neighbor"), "$match $match")
+            assertMatch(Regex("\\V$neighbor"), " $match")
+
+            assertMatch(Regex("$neighbor\\V+"), "$match  ")
+            assertMatch(Regex("$neighbor\\V+$neighbor"), "$match  $match")
+            assertMatch(Regex("\\V+$neighbor"), "  $match")
+
+            assertMatch(Regex("$neighbor\\V{2,4}"), "$match    ")
+            assertMatch(Regex("$neighbor\\V{2,4}$neighbor"), "$match   $match")
+            assertMatch(Regex("\\V{2,4}$neighbor"), "   $match")
+
+            assertMatch(Regex("$neighbor\\V*"), match)
+            assertMatch(Regex("$neighbor\\V*$neighbor"), "$match$match")
+            assertMatch(Regex("\\V*$neighbor"), match)
+        }
+
+        // Backrefs and `or` expression
+        assertMatch(Regex("(\\v)x\\1"), "\nx\n")
+        assertMatch(Regex("\\v|x"), "\n")
+        assertMatch(Regex("\\v|x"), "x")
+
+        assertMatch(Regex("(\\V)\n\\1"), "x\nx")
+        assertMatch(Regex("\\V|\n"), "x")
+        assertMatch(Regex("\\V|\n"), "\n")
+
+        // Boundaries: ^, $, \b, \B (where \b matches, \B must not, and vice versa)
+        assertMatch(Regex("^\\vx"), "\nx")
+        assertNoMatch(Regex("^\\vx"), "x\n")
+        assertMatch(Regex("x\\v$"), "x\n")
+        assertNoMatch(Regex("x\\v$"), "\nx")
+        assertMatch(Regex("abc\\b\\v"), "abc\n")
+        assertNoMatch(Regex("abc\\B\\v"), "abc\n")
+
+        assertMatch(Regex("^\\V\n"), "x\n")
+        assertNoMatch(Regex("^\\V\n"), "\nx")
+        assertMatch(Regex("\n\\V$"), "\nx")
+        assertNoMatch(Regex("\n\\V$"), "x\n")
+        assertMatch(Regex("abc\\B\\V"), "abcd")
+        assertNoMatch(Regex("abc\\b\\V"), "abcd")
+    }
+
+    @Test fun testHorizontalWhitespaceChar() {
+        // Covers the predefined classes described in the Java 8+ `Pattern` doc:
+        // \h - A horizontal whitespace character: [ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]
+        // \H - A non-horizontal whitespace character: [^\h]
+
+        val positiveRegex = Regex("\\h")
+        val negativeRegex = Regex("\\H")
+        val horizontalWhitespaces = listOf(" ", "\t", "\u00A0", "\u1680", "\u180e", "\u202f", "\u205f", "\u3000") +
+                ('\u2000'..'\u200a').map(Char::toString)
+        val nonHorizontalWhitespaces = listOf("1", "K", "\n", "${Char.MIN_HIGH_SURROGATE}${Char.MIN_LOW_SURROGATE}")
+
+        // Smoke tests
+        horizontalWhitespaces.forEach {
+            assertMatch(positiveRegex, it)
+            assertNoMatch(negativeRegex, it)
+
+            assertFind(positiveRegex, "prefix$it", 6..6)
+            assertFind(positiveRegex, "prefix${it}suffix", 6..6)
+            assertFind(positiveRegex, "${it}suffix", 0..0)
+        }
+
+        nonHorizontalWhitespaces.forEach {
+            assertNoMatch(positiveRegex, it)
+            assertMatch(negativeRegex, it)
+
+            assertFind(negativeRegex, "  $it", 2 until (2 + it.length))
+            assertFind(negativeRegex, "  $it  ", 2 until (2 + it.length))
+            assertFind(negativeRegex, "$it  ", 0 until it.length)
+        }
+
+        // Test that \h and \H can be a part of a more complex regex.
+        val neighbors = listOf(
+            // regex expression to its match.
+            "x" to "x",
+            " " to " ",
+            "\\n" to "\n",
+            "\\\\" to "\\",
+            "[abc]" to "a",
+            "." to "x",
+            "\\d" to "5",
+            "\\D" to "x",
+            "\\s" to " ",
+            "\\S" to "x",
+            "\\w" to "x",
+            "\\W" to "|",
+            "\\p{Alnum}" to "x",
+            "\\p{Space}" to " ",
+            "\\p{Blank}" to " ",
+            "\\p{Sc}" to "$"
+        ).flatMap { (expression, match) ->
+            listOf(
+                expression to match,
+                "$expression+" to match.repeat(2),
+                "$expression{2,4}" to match.repeat(3)
+            )
+        }
+
+        neighbors.forEach { (neighbor, match) ->
+            // \h
+            assertMatch(Regex("$neighbor\\h"), "$match ")
+            assertMatch(Regex("$neighbor\\h$neighbor"), "$match $match")
+            assertMatch(Regex("\\h$neighbor"), " $match")
+
+            assertMatch(Regex("$neighbor\\h+"), "$match  ")
+            assertMatch(Regex("$neighbor\\h+$neighbor"), "$match  $match")
+            assertMatch(Regex("\\h+$neighbor"), "  $match")
+
+            assertMatch(Regex("$neighbor\\h{2,4}"), "$match   ")
+            assertMatch(Regex("$neighbor\\h{2,4}$neighbor"), "$match   $match")
+            assertMatch(Regex("\\h{2,4}$neighbor"), "   $match")
+
+            assertMatch(Regex("$neighbor\\h*"), match)
+            assertMatch(Regex("$neighbor\\h*$neighbor"), "$match$match")
+            assertMatch(Regex("\\h*$neighbor"), match)
+
+            // \H
+            assertMatch(Regex("$neighbor\\H"), "$match\n")
+            assertMatch(Regex("$neighbor\\H$neighbor"), "$match\n$match")
+            assertMatch(Regex("\\H$neighbor"), "\n$match")
+
+            assertMatch(Regex("$neighbor\\H+"), "$match\n\n")
+            assertMatch(Regex("$neighbor\\H+$neighbor"), "$match\n\n$match")
+            assertMatch(Regex("\\H+$neighbor"), "\n\n$match")
+
+            assertMatch(Regex("$neighbor\\H{2,4}"), "$match\n\n\n")
+            assertMatch(Regex("$neighbor\\H{2,4}$neighbor"), "$match\n\n\n$match")
+            assertMatch(Regex("\\H{2,4}$neighbor"), "\n\n\n$match")
+
+            assertMatch(Regex("$neighbor\\H*"), match)
+            assertMatch(Regex("$neighbor\\H*$neighbor"), "$match$match")
+            assertMatch(Regex("\\H*$neighbor"), match)
+        }
+
+        // Backrefs and `or` expression
+        assertMatch(Regex("(\\h)x\\1"), " x ")
+        assertMatch(Regex("\\h|x"), " ")
+        assertMatch(Regex("\\h|x"), "x")
+
+        assertMatch(Regex("(\\H) \\1"), "x x")
+        assertMatch(Regex("\\H| "), "x")
+        assertMatch(Regex("\\H| "), " ")
+
+        // Boundaries: ^, $, \b, \B (where \b matches, \B must not, and vice versa)
+        assertMatch(Regex("^\\hx"), " x")
+        assertNoMatch(Regex("^\\hx"), "x ")
+        assertMatch(Regex("x\\h$"), "x ")
+        assertNoMatch(Regex("x\\h$"), " x")
+        assertMatch(Regex("abc\\b\\h"), "abc ")
+        assertNoMatch(Regex("abc\\B\\h"), "abc ")
+
+        assertMatch(Regex("^\\H "), "x ")
+        assertNoMatch(Regex("^\\H "), " x")
+        assertMatch(Regex(" \\H$"), " x")
+        assertNoMatch(Regex(" \\H$"), "x ")
+        assertMatch(Regex("abc\\B\\H"), "abcd")
+        assertNoMatch(Regex("abc\\b\\H"), "abcd")
+    }
+
+    @Test fun testUnicodeLinebreakChar() {
+        // Covers the linebreak matcher described in the Java 8+ `Pattern` doc:
+        // \R - Any Unicode linebreak sequence, is equivalent to \u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]
+
+        val regex = Regex("\\R")
+        val linebreaks = listOf("\u000D\u000A", "\u000A", "\u000B", "\u000C", "\u000D", "\u0085", "\u2028", "\u2029")
+        val nonLinebreaks = listOf("1", "K", " ", "${Char.MIN_HIGH_SURROGATE}${Char.MIN_LOW_SURROGATE}")
+
+        // Smoke tests
+        linebreaks.forEach {
+            assertMatch(regex, it)
+
+            assertFind(regex, "prefix$it", 6 until (6 + it.length))
+            assertFind(regex, "prefix${it}suffix", 6 until (6 + it.length))
+            assertFind(regex, "${it}suffix", 0 until it.length)
+        }
+
+        nonLinebreaks.forEach {
+            assertNoMatch(regex, it)
+        }
+
+        // The \r\n in the middle is matched both by a single \R (three \R in total) and by \R\R (four in total).
+        assertMatch(Regex("\\R\\R\\R"), "\r\r\n\n")
+        assertMatch(Regex("\\R\\R\\R\\R"), "\r\r\n\n")
+
+        // Test that \R can be a part of a more complex regex.
+        val neighbors = listOf(
+            // regex expression to its match.
+            "x" to "x",
+            " " to " ",
+            "\\n" to "\n",
+            "\\\\" to "\\",
+            "[abc]" to "a",
+            "." to "x",
+            "\\d" to "5",
+            "\\D" to "x",
+            "\\s" to " ",
+            "\\S" to "x",
+            "\\w" to "x",
+            "\\W" to "|",
+            "\\p{Alnum}" to "x",
+            "\\p{Space}" to " ",
+            "\\p{Blank}" to " ",
+            "\\p{Sc}" to "$"
+        ).flatMap { (expression, match) ->
+            listOf(
+                expression to match,
+                "$expression+" to match.repeat(2),
+                "$expression{2,4}" to match.repeat(3)
+            )
+        }
+
+        neighbors.forEach { (neighbor, match) ->
+            assertMatch(Regex("$neighbor\\R"), "$match\n")
+            assertMatch(Regex("$neighbor\\R$neighbor"), "$match\n$match")
+            assertMatch(Regex("\\R$neighbor"), "\n$match")
+
+            assertMatch(Regex("$neighbor\\R+"), "$match\n\n")
+            assertMatch(Regex("$neighbor\\R+$neighbor"), "$match\n\n$match")
+            assertMatch(Regex("\\R+$neighbor"), "\n\n$match")
+
+            assertMatch(Regex("$neighbor\\R{2,4}"), "$match\n\n\n")
+            assertMatch(Regex("$neighbor\\R{2,4}$neighbor"), "$match\n\n\n$match")
+            assertMatch(Regex("\\R{2,4}$neighbor"), "\n\n\n$match")
+
+            assertMatch(Regex("$neighbor\\R*"), match)
+            assertMatch(Regex("$neighbor\\R*$neighbor"), "$match$match")
+            assertMatch(Regex("\\R*$neighbor"), match)
+        }
+
+        // Backrefs and `or` expression
+        assertMatch(Regex("(\\R)x\\1"), "\nx\n")
+        assertMatch(Regex("\\R|x"), "\n")
+        assertMatch(Regex("\\R|x"), "x")
+
+        // Boundaries: ^, $, \b, \B (where \b matches, \B must not)
+        assertMatch(Regex("^\\Rx"), "\nx")
+        assertNoMatch(Regex("^\\Rx"), "x\n")
+        assertMatch(Regex("x\\R$"), "x\n")
+        assertNoMatch(Regex("x\\R$"), "\nx")
+        assertMatch(Regex("abc\\b\\R"), "abc\n")
+        assertNoMatch(Regex("abc\\B\\R"), "abc\n")
+    }
 }
commit	f7468cf9bcf03035ce7427e74b37597a595c89dc	[log] [tgz]
author	Ilya Matveev <ilya.matveev@jetbrains.com>	Thu Feb 17 16:26:45 2022 +0700
committer	Ilya Matveev <ilya.matveev@jetbrains.com>	Mon Feb 21 13:20:23 2022 +0700
tree	8e98cd6e605be97ead1cd6e0c725aa33df086bc8
parent	798a15c4509f3a25c9a32712d9f1a1a1761b9375 [diff]