| /* |
| * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors. |
| * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. |
| */ |
| |
| package test.text |
| |
| import test.assertArrayContentEquals |
| import kotlin.test.* |
| |
| // When decoding UTF-8, the JVM and JS implementations replace a byte sequence that encodes a surrogate code point differently. |
| // JS replaces each byte of the sequence with the replacement char, whereas JVM replaces the whole sequence with a single replacement char. |
| // See the corresponding actual declaration to find out the replacement. |
| internal expect val surrogateCodePointDecoding: String |
| |
| // The byte sequence used to replace a surrogate char when encoding. |
| // The JVM default replacement sequence consists of a single 0x3F byte. |
| // The JS and Native replacement byte sequence is [0xEF, 0xBF, 0xBD]. |
| internal expect val surrogateCharEncoding: ByteArray |
| |
/**
 * Tests UTF-8 conversion between [String] and [ByteArray] through `encodeToByteArray` and `decodeToString`.
 *
 * Every case is checked in the lenient mode (ill-formed input is expected to be substituted) and in the
 * strict mode (`throwOnInvalidSequence = true`), where ill-formed input is expected to raise
 * [CharacterCodingException]. Platform-dependent substitutions come from [surrogateCharEncoding] and
 * [surrogateCodePointDecoding].
 */
class StringEncodingTest {
    /** Builds a [ByteArray] from [elements], converting each value with `toByte()`. */
    private fun bytes(vararg elements: Int): ByteArray = elements.map { it.toByte() }.toByteArray()

    /**
     * Checks encoding of the whole [string].
     *
     * The lenient call must always yield [expected]. When [isWellFormed] is `true` the strict call must
     * yield [expected] as well and decode back to [string]; otherwise the strict call must throw.
     */
    private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String) {
        assertArrayContentEquals(expected, string.encodeToByteArray())

        // Re-evaluated on every use so that each assertion performs its own strict-mode call.
        val encodeStrictly = { string.encodeToByteArray(throwOnInvalidSequence = true) }
        if (isWellFormed) {
            assertArrayContentEquals(expected, encodeStrictly())
            assertEquals(string, encodeStrictly().decodeToString())
        } else {
            assertFailsWith<CharacterCodingException> { encodeStrictly() }
        }
    }

    /**
     * Checks encoding of the `[startIndex, endIndex)` part of [string].
     *
     * Same contract as the whole-string overload, except that the round trip is compared against
     * `string.substring(startIndex, endIndex)`.
     */
    private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String, startIndex: Int, endIndex: Int) {
        assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex))

        val encodeStrictly = { string.encodeToByteArray(startIndex, endIndex, throwOnInvalidSequence = true) }
        if (isWellFormed) {
            assertArrayContentEquals(expected, encodeStrictly())
            assertEquals(string.substring(startIndex, endIndex), encodeStrictly().decodeToString())
        } else {
            assertFailsWith<CharacterCodingException> { encodeStrictly() }
        }
    }

    // https://youtrack.jetbrains.com/issue/KT-31614
    // Builds a string from raw UTF-16 code units; the tests use it for strings containing lone surrogates.
    // NOTE(review): presumably the linked issue is why such strings are not written as literals - confirm.
    private fun string(vararg codeUnits: Int): String =
        CharArray(codeUnits.size) { Char(codeUnits[it]) }.concatToString()

    @Test
    fun encodeToByteArray() {
        // Local shorthands: a well-formed text with its UTF-8 bytes, and an ill-formed text given as code units.
        fun wellFormed(text: String, vararg utf8: Int) = testEncoding(true, bytes(*utf8), text)
        fun illFormed(expected: ByteArray, vararg codeUnits: Int) = testEncoding(false, expected, string(*codeUnits))

        // empty string
        wellFormed("")

        // 1-byte chars
        wellFormed("\u0000", 0)
        wellFormed("-", 0x2D)
        wellFormed("\u007F", 0x7F)

        // 2-byte chars
        wellFormed("\u0080", 0xC2, 0x80)
        wellFormed("¿", 0xC2, 0xBF)
        wellFormed("\u07FF", 0xDF, 0xBF)

        // 3-byte chars
        wellFormed("\u0800", 0xE0, 0xA0, 0x80)
        wellFormed("斤", 0xE6, 0x96, 0xA4)
        wellFormed("\uD7FF", 0xED, 0x9F, 0xBF)

        // surrogate chars
        illFormed(surrogateCharEncoding, 0xD800)
        illFormed(surrogateCharEncoding, 0xDB6A)
        illFormed(surrogateCharEncoding, 0xDFFF)

        // 3-byte chars
        wellFormed("\uE000", 0xEE, 0x80, 0x80)
        wellFormed("\uF63C", 0xEF, 0x98, 0xBC)
        wellFormed("\uFFFF", 0xEF, 0xBF, 0xBF)

        // 4-byte surrogate pairs
        wellFormed("\uD800\uDC00", 0xF0, 0x90, 0x80, 0x80)
        wellFormed("\uDA49\uDDFC", 0xF2, 0xA2, 0x97, 0xBC)
        wellFormed("\uDBFF\uDFFF", 0xF4, 0x8F, 0xBF, 0xBF)

        // reversed surrogate pairs
        illFormed(surrogateCharEncoding + surrogateCharEncoding, 0xDC00, 0xD800)
        illFormed(surrogateCharEncoding + surrogateCharEncoding, 0xDDFC, 0xDA49)
        illFormed(surrogateCharEncoding + surrogateCharEncoding, 0xDFFF, 0xDBFF)

        // well-formed chars followed by a tail with four unpaired surrogates
        testEncoding(
            isWellFormed = false,
            expected = bytes(
                0, /**/ 0x2D, /**/ 0x7F, /**/ 0xC2, 0x80, /**/ 0xC2, 0xBF, /**/ 0xDF, 0xBF, /**/ 0xE0, 0xA0, 0x80, /**/
                0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A
            ) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding,
            string = "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A)
        )

        // 3-byte, 2-byte and 4-byte chars mixed in a single well-formed string
        wellFormed(
            "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF",
            0xEE, 0x80, 0x80, /**/ 0xEF, 0x98, 0xBC, /**/ 0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/
            0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC, /**/ 0xF4, 0x8F, 0xBF, 0xBF
        )

        // long input: 200 000 'k' chars must become 200 000 bytes equal to 0x6B
        val longText = CharArray(200_000) { 'k' }.concatToString()
        val longBytes = longText.encodeToByteArray()
        assertEquals(200_000, longBytes.size)
        assertTrue(longBytes.all { it == 0x6B.toByte() })
    }

    @Test
    fun encodeToByteArraySlice() {
        // invalid startIndex / endIndex combinations
        val digits = "123"
        assertFailsWith<IllegalArgumentException> { "".encodeToByteArray(startIndex = 1) }
        assertFailsWith<IllegalArgumentException> { digits.encodeToByteArray(startIndex = 10) }
        assertFailsWith<IndexOutOfBoundsException> { digits.encodeToByteArray(startIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { digits.encodeToByteArray(endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { digits.encodeToByteArray(endIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { digits.encodeToByteArray(startIndex = 5, endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { digits.encodeToByteArray(startIndex = 5, endIndex = 2) }
        assertFailsWith<IndexOutOfBoundsException> { digits.encodeToByteArray(startIndex = 1, endIndex = 4) }

        // slices of an ASCII string, including empty slices at both ends
        testEncoding(true, bytes(), "abc", startIndex = 0, endIndex = 0)
        testEncoding(true, bytes(), "abc", startIndex = 3, endIndex = 3)
        testEncoding(true, bytes(0x62, 0x63), "abc", startIndex = 1, endIndex = 3)
        testEncoding(true, bytes(0x61, 0x62), "abc", startIndex = 0, endIndex = 2)
        testEncoding(true, bytes(0x62), "abc", startIndex = 1, endIndex = 2)

        // a slice covering a single char of each encoded width
        testEncoding(true, bytes(0x2D), "-", startIndex = 0, endIndex = 1)
        testEncoding(true, bytes(0xC2, 0xBF), "¿", startIndex = 0, endIndex = 1)
        testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤", startIndex = 0, endIndex = 1)

        testEncoding(false, surrogateCharEncoding, string(0xDB6A), startIndex = 0, endIndex = 1)

        testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C", startIndex = 0, endIndex = 1)

        // a surrogate pair is well-formed as a whole; each half taken alone is not
        testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC", startIndex = 0, endIndex = 2)
        testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", startIndex = 0, endIndex = 1)
        testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", startIndex = 1, endIndex = 2)

        // slice [7, 12): three well-formed chars followed by two unpaired surrogates
        testEncoding(
            isWellFormed = false,
            expected = bytes(0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding,
            string = "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A),
            startIndex = 7,
            endIndex = 12
        )

        // slice [2, 9) ends between the two halves of the last surrogate pair
        testEncoding(
            isWellFormed = false,
            expected = bytes(0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC) /**/ + surrogateCharEncoding,
            string = "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF",
            startIndex = 2,
            endIndex = 9
        )

        // long input: a 190 000-char slice of 'k' chars must become 190 000 bytes equal to 0x6B
        val longText = CharArray(200_000) { 'k' }.concatToString()
        val longBytes = longText.encodeToByteArray(startIndex = 5000, endIndex = 195_000)
        assertEquals(190_000, longBytes.size)
        assertTrue(longBytes.all { it == 0x6B.toByte() })
    }

    /**
     * Checks decoding of the whole [bytes] array.
     *
     * The lenient call must always yield [expected]. When [isWellFormed] is `true` the strict call must
     * yield [expected] as well and encode back to [bytes]; otherwise the strict call must throw.
     */
    private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray) {
        assertEquals(expected, bytes.decodeToString())

        // Re-evaluated on every use so that each assertion performs its own strict-mode call.
        val decodeStrictly = { bytes.decodeToString(throwOnInvalidSequence = true) }
        if (isWellFormed) {
            assertEquals(expected, decodeStrictly())
            assertArrayContentEquals(bytes, decodeStrictly().encodeToByteArray())
        } else {
            assertFailsWith<CharacterCodingException> { decodeStrictly() }
        }
    }

    /**
     * Checks decoding of the `[startIndex, endIndex)` part of [bytes].
     *
     * Same contract as the whole-array overload, except that the round trip is compared against
     * `bytes.sliceArray(startIndex until endIndex)`.
     */
    private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray, startIndex: Int, endIndex: Int) {
        assertEquals(expected, bytes.decodeToString(startIndex, endIndex))

        val decodeStrictly = { bytes.decodeToString(startIndex, endIndex, throwOnInvalidSequence = true) }
        if (isWellFormed) {
            assertEquals(expected, decodeStrictly())
            assertArrayContentEquals(bytes.sliceArray(startIndex until endIndex), decodeStrictly().encodeToByteArray())
        } else {
            assertFailsWith<CharacterCodingException> { decodeStrictly() }
        }
    }

    /**
     * Expected decoding of a surrogate byte sequence whose last byte is missing: [surrogateCodePointDecoding]
     * without its last char when it is longer than one char, otherwise [surrogateCodePointDecoding] itself.
     */
    private fun truncatedSurrogateDecoding(): String {
        val fullReplacement = surrogateCodePointDecoding
        return if (fullReplacement.length > 1) fullReplacement.dropLast(1) else fullReplacement
    }

    @Test
    fun decodeToString() {
        // Local shorthands: the expected string followed by the input bytes.
        fun wellFormed(expected: String, vararg utf8: Int) = testDecoding(true, expected, bytes(*utf8))
        fun illFormed(expected: String, vararg utf8: Int) = testDecoding(false, expected, bytes(*utf8))

        wellFormed("") // empty
        wellFormed("\u0000", 0x0) // null char
        wellFormed("zC", 0x7A, 0x43) // 1-byte chars

        illFormed("��", 0x85, 0xAF) // invalid bytes starting with 1 bit
        wellFormed("¿", 0xC2, 0xBF) // 2-byte char
        illFormed("�z", 0xCF, 0x7A) // 2-byte char, second byte starts with 0 bit
        illFormed("��", 0xC1, 0xAA) // 1-byte char written in two bytes

        illFormed("�z", 0xEF, 0xAF, 0x7A) // 3-byte char, third byte starts with 0 bit
        illFormed("���", 0xE0, 0x9F, 0xAF) // 2-byte char written in three bytes
        illFormed("�z", 0xE0, 0xAF, 0x7A) // 3-byte char, third byte starts with 0 bit
        wellFormed("\u1FFF", 0xE1, 0xBF, 0xBF) // 3-byte char

        illFormed(surrogateCodePointDecoding, 0xED, 0xAF, 0xBF) // 3-byte high-surrogate char
        illFormed(surrogateCodePointDecoding, 0xED, 0xB3, 0x9A) // 3-byte low-surrogate char
        illFormed(
            surrogateCodePointDecoding + surrogateCodePointDecoding,
            0xED, 0xAF, 0xBF, /**/ 0xED, 0xB3, 0x9A
        ) // surrogate pair chars
        illFormed("�z", 0xEF, 0x7A) // 3-byte char, second byte starts with 0 bit, third byte missing

        illFormed("�����", 0xF9, 0x94, 0x80, 0x80, 0x80) // 5-byte code point larger than 0x10FFFF
        illFormed("������", 0xFD, 0x94, 0x80, 0x80, 0x80, 0x80) // 6-byte code point larger than 0x10FFFF

        // Ill-Formed Sequences for Surrogates
        illFormed(
            surrogateCodePointDecoding + surrogateCodePointDecoding + truncatedSurrogateDecoding() + "A",
            0xED, 0xA0, 0x80, /**/ 0xED, 0xBF, 0xBF, /**/ 0xED, 0xAF, /**/ 0x41
        )
        // Truncated Sequences
        illFormed("����A", 0xE1, 0x80, /**/ 0xE2, /**/ 0xF0, 0x91, 0x92, /**/ 0xF1, 0xBF, /**/ 0x41)

        illFormed("�", 0xE0, 0xAF) // 3-byte char, third byte missing

        wellFormed("\uD83D\uDFDF", 0xF0, 0x9F, 0x9F, 0x9F) // 4-byte char
        illFormed("����", 0xF0, 0x8F, 0x9F, 0x9F) // 3-byte char written in four bytes
        illFormed("����", 0xF4, 0x9F, 0x9F, 0x9F) // 4-byte code point larger than 0x10FFFF
        illFormed("����", 0xF5, 0x80, 0x80, 0x80) // 4-byte code point larger than 0x10FFFF

        // Non-Shortest Form Sequences
        illFormed("��������A", 0xC0, 0xAF, /**/ 0xE0, 0x80, 0xBF, /**/ 0xF0, 0x81, 0x82, /**/ 0x41)
        // Other Ill-Formed Sequences
        illFormed("�����A��B", 0xF4, 0x91, 0x92, 0x93, /**/ 0xFF, /**/ 0x41, /**/ 0x80, 0xBF, /**/ 0x42)

        // long input: 200 000 bytes equal to 0x6B must become 200 000 'k' chars
        val longString = ByteArray(200_000) { 0x6B.toByte() }.decodeToString()
        assertEquals(200_000, longString.length)
        assertTrue(longString.all { it == 'k' })
    }

    @Test
    fun decodeToStringSlice() {
        // Factories (not shared values) so that every call still receives a freshly built array.
        fun abc() = bytes(0x61, 0x62, 0x63)
        fun twoByteChar() = bytes(0xC2, 0xBF)
        fun brokenThreeByteChar() = bytes(0xEF, 0xAF, 0x7A)
        fun fourByteChar() = bytes(0xF0, 0x9F, 0x9F, 0x9F)

        // invalid startIndex / endIndex combinations
        assertFailsWith<IllegalArgumentException> { bytes().decodeToString(1, 0) }
        assertFailsWith<IllegalArgumentException> { abc().decodeToString(startIndex = 10) }
        assertFailsWith<IndexOutOfBoundsException> { abc().decodeToString(startIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { abc().decodeToString(endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { abc().decodeToString(endIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { abc().decodeToString(startIndex = 5, endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { abc().decodeToString(startIndex = 5, endIndex = 2) }
        assertFailsWith<IndexOutOfBoundsException> { abc().decodeToString(startIndex = 1, endIndex = 4) }

        // slices of ASCII bytes, including empty slices
        testDecoding(true, "", bytes(), startIndex = 0, endIndex = 0)
        testDecoding(true, "", abc(), startIndex = 0, endIndex = 0)
        testDecoding(true, "", abc(), startIndex = 3, endIndex = 3)
        testDecoding(true, "abc", abc(), startIndex = 0, endIndex = 3)
        testDecoding(true, "ab", abc(), startIndex = 0, endIndex = 2)
        testDecoding(true, "bc", abc(), startIndex = 1, endIndex = 3)
        testDecoding(true, "b", abc(), startIndex = 1, endIndex = 2)

        // a 2-byte char taken whole and cut on either side
        testDecoding(true, "¿", twoByteChar(), startIndex = 0, endIndex = 2)
        testDecoding(false, "�", twoByteChar(), startIndex = 0, endIndex = 1)
        testDecoding(false, "�", twoByteChar(), startIndex = 1, endIndex = 2)

        // an incomplete 3-byte sequence followed by 'z'
        testDecoding(false, "�", brokenThreeByteChar(), startIndex = 0, endIndex = 2)
        testDecoding(false, "�z", brokenThreeByteChar(), startIndex = 1, endIndex = 3)
        testDecoding(true, "z", brokenThreeByteChar(), startIndex = 2, endIndex = 3)

        // surrogate code point sequences, whole and cut
        testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF), startIndex = 0, endIndex = 3)
        testDecoding(false, truncatedSurrogateDecoding(), bytes(0xED, 0xB3, 0x9A), startIndex = 0, endIndex = 2)
        testDecoding(false, "���", bytes(0xED, 0xAF, 0xBF, 0xED, 0xB3, 0x9A), startIndex = 1, endIndex = 4)
        testDecoding(false, "�", bytes(0xEF, 0x7A), startIndex = 0, endIndex = 1)
        testDecoding(true, "z", bytes(0xEF, 0x7A), startIndex = 1, endIndex = 2)

        // a 4-byte char taken whole, and slices that contain only its continuation bytes
        testDecoding(true, "\uD83D\uDFDF", fourByteChar(), startIndex = 0, endIndex = 4)
        testDecoding(false, "��", fourByteChar(), startIndex = 2, endIndex = 4)
        testDecoding(false, "��", fourByteChar(), startIndex = 1, endIndex = 3)

        // long input: a 190 000-byte slice of 0x6B bytes must become 190 000 'k' chars
        val longString = ByteArray(200_000) { 0x6B.toByte() }.decodeToString(startIndex = 5000, endIndex = 195_000)
        assertEquals(190_000, longString.length)
        assertTrue(longString.all { it == 'k' })
    }

    @Test
    fun kotlinxIOUnicodeTest() {
        // Parses space-separated hexadecimal byte values; blank tokens produced by repeated spaces are skipped.
        fun String.readHex(): ByteArray =
            split(' ').filterNot(String::isBlank).map { it.toInt(radix = 16).toByte() }.toByteArray()

        val smokeTestData = "\ud83c\udf00"
        val smokeTestDataCharArray: CharArray = smokeTestData.toCharArray()
        val smokeTestDataAsBytes = "f0 9f 8c 80".readHex()

        val testData = "file content with unicode " +
                "\ud83c\udf00 :" +
                " \u0437\u0434\u043e\u0440\u043e\u0432\u0430\u0442\u044c\u0441\u044f :" +
                " \uc5ec\ubcf4\uc138\uc694 :" +
                " \u4f60\u597d :" +
                " \u00f1\u00e7"
        val testDataCharArray: CharArray = testData.toCharArray()
        val testDataAsBytes: ByteArray = ("66 69 6c 65 20 63 6f 6e 74 65 6e 74 20 77 69 74 " +
                " 68 20 75 6e 69 63 6f 64 65 20 f0 9f 8c 80 20 3a 20 d0 b7 d0 b4 d0 be d1 " +
                "80 d0 be d0 b2 d0 b0 d1 82 d1 8c d1 81 d1 8f 20 3a 20 ec 97 ac eb b3 b4 ec " +
                " 84 b8 ec 9a 94 20 3a 20 e4 bd a0 e5 a5 bd 20 3a 20 c3 b1 c3 a7").readHex()

        // Each check below runs for the smoke sample first and the long sample second.
        val cases = listOf(
            Triple(smokeTestData, smokeTestDataCharArray, smokeTestDataAsBytes),
            Triple(testData, testDataCharArray, testDataAsBytes),
        )

        // String -> bytes
        cases.forEach { (text, _, utf8) -> assertArrayContentEquals(utf8, text.encodeToByteArray()) }
        // bytes -> String
        cases.forEach { (text, _, utf8) -> assertEquals(text, utf8.decodeToString()) }
        // chars -> String
        cases.forEach { (text, chars, _) -> assertEquals(text, chars.concatToString()) }
        // String -> chars
        cases.forEach { (text, chars, _) -> assertArrayContentEquals(chars, text.toCharArray()) }
        // chars -> String -> bytes
        cases.forEach { (_, chars, utf8) -> assertArrayContentEquals(utf8, chars.concatToString().encodeToByteArray()) }
        // bytes -> String -> chars
        cases.forEach { (_, chars, utf8) -> assertArrayContentEquals(chars, utf8.decodeToString().toCharArray()) }

        // 4-, 3- and 2-byte chars, each followed by a line feed
        assertEquals("\uD858\uDE18\n", bytes(0xF0, 0xA6, 0x88, 0x98, 0x0a).decodeToString())
        assertEquals("\u0BF5\n", bytes(0xE0, 0xAF, 0xB5, 0x0A).decodeToString())
        assertEquals("\u041a\n", bytes(0xD0, 0x9A, 0x0A).decodeToString())
    }
}