Add UTF-8 handling to ProtoStr. This mostly uses a copied `Utf8Chunks` utility from nightly Rust. PiperOrigin-RevId: 551224417

commit: 4b0e76370b3abcfc8ee749ac38d75ecef84bed83 [log] [tgz]
author: Protobuf Team Bot <protobuf-github-bot@google.com> Wed Jul 26 09:09:44 2023 -0700
committer: Copybara-Service <copybara-worker@google.com> Wed Jul 26 09:17:38 2023 -0700
tree: c579c93b39e18778e08c79057e77acace2195554
parent: caf55184b2d0e8cbb99e5b487b453dc8721af4fe [diff] [blame]
diff --git a/rust/string.rs b/rust/string.rs
index fe9d827..143d284 100644
--- a/rust/string.rs
+++ b/rust/string.rs

@@ -41,6 +41,7 @@
 use std::hash::{Hash, Hasher};
 use std::iter;
 use std::ops::{Deref, DerefMut};
+use utf8::Utf8Chunks;
 
 /// This type will be replaced by something else in a future revision.
 // TODO(b/285309330): remove this and any `impl`s using it.
@@ -251,7 +252,7 @@
 }
 
 /// The bytes were not valid UTF-8.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub struct Utf8Error(pub(crate) ());
 
 impl From<std::str::Utf8Error> for Utf8Error {
@@ -355,16 +356,34 @@
     ///
     /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
     pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
-        todo!("b/285309330: requires UTF-8 chunk splitting");
-        ['a'].into_iter() // necessary for `impl Trait` to compile
+        // Each chunk exposes a valid `str` piece together with the invalid
+        // bytes (possibly none) reported alongside it. The valid characters
+        // are emitted first, then the marker for the invalid bytes.
+        Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
+            // Yield a single replacement character for every non-empty
+            // invalid sequence. An `Option` iterates over zero or one items,
+            // so it chains directly after the valid characters without any
+            // hand-rolled iterator state.
+            let replacement = (!chunk.invalid().is_empty()).then_some(char::REPLACEMENT_CHARACTER);
+            chunk.valid().chars().chain(replacement)
+        })
     }
 
     /// Returns an iterator over chunks of UTF-8 data in the string.
     ///
     /// An `Ok(&str)` is yielded for every valid UTF-8 chunk, and an
-    /// `Err(&[u8])` for non-UTF-8 chunks.
-    pub fn utf8_chunks(&self) -> Todo<'_> {
-        todo!("b/285309330: requires UTF-8 chunk splitting");
+    /// `Err(&[u8])` for each non-UTF-8 chunk. Contiguous invalid chunks are
+    /// reported as consecutive `Err` items rather than being merged, and the
+    /// bytes carried by any single `Err` are at most 3 bytes long.
+    pub fn utf8_chunks(&self) -> impl Iterator<Item = Result<&str, &[u8]>> + '_ {
+        Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
+            let (valid, invalid) = (chunk.valid(), chunk.invalid());
+            // Empty pieces are dropped, so neither `Ok("")` nor an empty
+            // `Err` is ever yielded.
+            let ok_item = (!valid.is_empty()).then_some(Ok(valid));
+            let err_item = (!invalid.is_empty()).then_some(Err(invalid));
+            ok_item.into_iter().chain(err_item)
+        })
     }
 
     /// Converts known-UTF-8 bytes to a `ProtoStr` without a check.
@@ -407,14 +426,43 @@
 }
 
 impl fmt::Debug for ProtoStr {
-    fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        todo!("b/285309330: requires UTF-8 chunk splitting")
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Delegate to the debug adapter of `Utf8Chunks`; the unit tests expect
+        // invalid bytes to be rendered as `\xNN` escapes inside the quotes.
+        fmt::Debug::fmt(&Utf8Chunks::new(self.as_bytes()).debug(), f)
     }
 }
 
 impl fmt::Display for ProtoStr {
-    fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        todo!("b/285309330: requires UTF-8 chunk splitting")
+    /// Writes the string lossily: every invalid byte sequence is shown as a
+    /// single `U+FFFD REPLACEMENT CHARACTER`.
+    ///
+    /// Formatter flags (width, fill, precision) are honored when the contents
+    /// are entirely valid UTF-8. Otherwise the pieces are written verbatim:
+    /// forwarding the flags would pad or truncate each piece individually.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use std::fmt::Write as _;
+        let bytes = self.as_bytes();
+        if bytes.is_empty() {
+            // NOTE(review): `Utf8Chunks` was copied from std, whose iterator
+            // yields nothing for empty input — confirm. Formatting `""`
+            // directly keeps width/fill working either way.
+            return fmt::Display::fmt("", f);
+        }
+        for chunk in Utf8Chunks::new(bytes) {
+            let valid = chunk.valid();
+            if valid.len() == bytes.len() {
+                // The whole string is valid UTF-8: defer to `str` so the
+                // formatter flags apply exactly once, to the full text.
+                return fmt::Display::fmt(valid, f);
+            }
+            f.write_str(valid)?;
+            if !chunk.invalid().is_empty() {
+                // One invalid chunk is emitted per detected invalid sequence.
+                f.write_char(char::REPLACEMENT_CHARACTER)?;
+            }
+        }
+        Ok(())
     }
 }
 
@@ -466,5 +493,211 @@
 
 #[cfg(test)]
 mod tests {
+    use super::*;
+
     // TODO(b/285309330): Add unit tests
+
+    // Shorter and safe utility function to construct `ProtoStr` from bytes for
+    // testing.
+    fn test_proto_str(bytes: &[u8]) -> &ProtoStr {
+        // SAFETY: The runtime that this test executes under does not elide UTF-8 checks
+        // inside of `ProtoStr`.
+        unsafe { ProtoStr::from_utf8_unchecked(bytes) }
+    }
+
+    // UTF-8 test cases copied from:
+    // https://github.com/rust-lang/rust/blob/e8ee0b7/library/core/tests/str_lossy.rs
+
+    #[test]
+    fn proto_str_debug() {
+        assert_eq!(&format!("{:?}", test_proto_str(b"Hello There")), "\"Hello There\"");
+        assert_eq!(
+            &format!(
+                "{:?}",
+                test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa"),
+            ),
+            "\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
+        );
+    }
+
+    #[test]
+    fn proto_str_display() {
+        assert_eq!(&test_proto_str(b"Hello There").to_string(), "Hello There");
+        assert_eq!(
+            &test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").to_string(),
+            "Hello�� There� Goodbye\u{10d4ea}",
+        );
+    }
+
+    #[test]
+    fn proto_str_to_rust_str() {
+        assert_eq!(test_proto_str(b"hello").to_str(), Ok("hello"));
+        assert_eq!(test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_str(), Ok("ศไทย中华Việt Nam"));
+        for expect_fail in [
+            &b"Hello\xC2 There\xFF Goodbye"[..],
+            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
+            b"\xF5foo\xF5\x80bar",
+            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
+            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
+            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
+            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
+        ] {
+            assert_eq!(test_proto_str(expect_fail).to_str(), Err(Utf8Error(())), "{expect_fail:?}");
+        }
+    }
+
+    #[test]
+    fn proto_str_to_cow() {
+        assert_eq!(test_proto_str(b"hello").to_cow_lossy(), Cow::Borrowed("hello"));
+        assert_eq!(
+            test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_cow_lossy(),
+            Cow::Borrowed("ศไทย中华Việt Nam")
+        );
+        for (bytes, lossy_str) in [
+            (&b"Hello\xC2 There\xFF Goodbye"[..], "Hello� There� Goodbye"),
+            (b"Hello\xC0\x80 There\xE6\x83 Goodbye", "Hello�� There� Goodbye"),
+            (b"\xF5foo\xF5\x80bar", "�foo��bar"),
+            (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "�foo�bar�baz"),
+            (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "�foo�bar��baz"),
+            (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "����foo\u{10000}bar"),
+            (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "���foo���bar"),
+        ] {
+            let cow = test_proto_str(bytes).to_cow_lossy();
+            assert!(matches!(cow, Cow::Owned(_)));
+            assert_eq!(&*cow, lossy_str, "{bytes:?}");
+        }
+    }
+
+    #[test]
+    fn proto_str_utf8_chunks() {
+        // Asserts that `utf8_chunks()` over `$bytes` yields exactly the listed
+        // `Ok`/`Err` items, in order.
+        macro_rules! assert_chunks {
+            ($bytes:expr, $($chunks:expr),* $(,)?) => {
+                let bytes = $bytes;
+                let chunks: &[Result<&str, &[u8]>] = &[$($chunks),*];
+                let s = test_proto_str(bytes);
+                let got_chunks = s.utf8_chunks();
+                let expected_chunks = chunks.iter().copied();
+                assert!(got_chunks.eq(expected_chunks), "{bytes:?} -> {chunks:?}");
+            };
+        }
+        assert_chunks!(b"hello", Ok("hello"));
+        assert_chunks!("ศไทย中华Việt Nam".as_bytes(), Ok("ศไทย中华Việt Nam"));
+        assert_chunks!(
+            b"Hello\xC2 There\xFF Goodbye",
+            Ok("Hello"),
+            Err(b"\xC2"),
+            Ok(" There"),
+            Err(b"\xFF"),
+            Ok(" Goodbye"),
+        );
+        assert_chunks!(
+            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
+            Ok("Hello"),
+            Err(b"\xC0"),
+            Err(b"\x80"),
+            Ok(" There"),
+            Err(b"\xE6\x83"),
+            Ok(" Goodbye"),
+        );
+        assert_chunks!(
+            b"\xF5foo\xF5\x80bar",
+            Err(b"\xF5"),
+            Ok("foo"),
+            Err(b"\xF5"),
+            Err(b"\x80"),
+            Ok("bar"),
+        );
+        assert_chunks!(
+            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
+            Err(b"\xF1"),
+            Ok("foo"),
+            Err(b"\xF1\x80"),
+            Ok("bar"),
+            Err(b"\xF1\x80\x80"),
+            Ok("baz"),
+        );
+        assert_chunks!(
+            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
+            Err(b"\xF4"),
+            Ok("foo"),
+            Err(b"\xF4\x80"),
+            Ok("bar"),
+            Err(b"\xF4"),
+            Err(b"\xBF"),
+            Ok("baz"),
+        );
+        assert_chunks!(
+            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
+            Err(b"\xF0"),
+            Err(b"\x80"),
+            Err(b"\x80"),
+            Err(b"\x80"),
+            Ok("foo\u{10000}bar"),
+        );
+        assert_chunks!(
+            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
+            Err(b"\xED"),
+            Err(b"\xA0"),
+            Err(b"\x80"),
+            Ok("foo"),
+            Err(b"\xED"),
+            Err(b"\xBF"),
+            Err(b"\xBF"),
+            Ok("bar"),
+        );
+    }
+
+    #[test]
+    fn proto_str_chars() {
+        // Asserts that `chars()` over `$bytes` yields exactly `$chars`, in
+        // order.
+        macro_rules! assert_chars {
+            ($bytes:expr, $chars:expr) => {
+                let bytes = $bytes;
+                let chars = $chars;
+                let s = test_proto_str(bytes);
+                let got_chars = s.chars();
+                let expected_chars = chars.into_iter();
+                assert!(got_chars.eq(expected_chars), "{bytes:?} -> {chars:?}");
+            };
+        }
+        assert_chars!(b"hello", ['h', 'e', 'l', 'l', 'o']);
+        assert_chars!(
+            "ศไทย中华Việt Nam".as_bytes(),
+            ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']
+        );
+        assert_chars!(
+            b"Hello\xC2 There\xFF Goodbye",
+            [
+                'H', 'e', 'l', 'l', 'o', '�', ' ', 'T', 'h', 'e', 'r', 'e', '�', ' ', 'G', 'o',
+                'o', 'd', 'b', 'y', 'e'
+            ]
+        );
+        assert_chars!(
+            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
+            [
+                'H', 'e', 'l', 'l', 'o', '�', '�', ' ', 'T', 'h', 'e', 'r', 'e', '�', ' ', 'G',
+                'o', 'o', 'd', 'b', 'y', 'e'
+            ]
+        );
+        assert_chars!(b"\xF5foo\xF5\x80bar", ['�', 'f', 'o', 'o', '�', '�', 'b', 'a', 'r']);
+        assert_chars!(
+            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
+            ['�', 'f', 'o', 'o', '�', 'b', 'a', 'r', '�', 'b', 'a', 'z']
+        );
+        assert_chars!(
+            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
+            ['�', 'f', 'o', 'o', '�', 'b', 'a', 'r', '�', '�', 'b', 'a', 'z']
+        );
+        assert_chars!(
+            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
+            ['�', '�', '�', '�', 'f', 'o', 'o', '\u{10000}', 'b', 'a', 'r']
+        );
+        assert_chars!(
+            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
+            ['�', '�', '�', 'f', 'o', 'o', '�', '�', '�', 'b', 'a', 'r']
+        );
+    }
 }
commit	4b0e76370b3abcfc8ee749ac38d75ecef84bed83	[log] [tgz]
author	Protobuf Team Bot <protobuf-github-bot@google.com>	Wed Jul 26 09:09:44 2023 -0700
committer	Copybara-Service <copybara-worker@google.com>	Wed Jul 26 09:17:38 2023 -0700
tree	c579c93b39e18778e08c79057e77acace2195554
parent	caf55184b2d0e8cbb99e5b487b453dc8721af4fe [diff] [blame]