Introduce new compile-time flag: PB_VALIDATE_UTF8. It checks whether incoming strings are valid UTF-8 sequences. Slows down the string processing slightly and slightly increases code size.

commit: c00d4c1979c1d0dd481091c148f951e466eab9dc [log] [tgz]
author: Pavol Rusnak <pavol@rusnak.io> Mon Dec 16 23:44:50 2019 +0100
committer: Pavol Rusnak <pavol@rusnak.io> Tue Dec 17 14:36:29 2019 +0100
tree: ed0941940c5d763001bbdf0fab96953af5eb7ece
parent: 28706965251c26d5f9dab6defdca2e384cbf4da7 [diff]
diff --git a/docs/reference.rst b/docs/reference.rst
index 39bc611..82fb480 100644
--- a/docs/reference.rst
+++ b/docs/reference.rst

@@ -61,6 +61,9 @@
                                arrays. Such example is older protobuf.js.
 PB_CONVERT_DOUBLE_FLOAT        Convert doubles to floats for platforms that do
                                not support 64-bit doubles. Mainly AVR.
+PB_VALIDATE_UTF8               Check whether incoming strings are valid UTF-8
+                               sequences. Slows down the string processing
+                               slightly and slightly increases code size.
 ============================  ================================================
 
 The PB_MAX_REQUIRED_FIELDS, PB_FIELD_16BIT and PB_FIELD_32BIT settings allow

diff --git a/pb.h b/pb.h
index 74cc7e9..9ef8019 100644
--- a/pb.h
+++ b/pb.h

@@ -49,6 +49,10 @@
  * support 64-bit doubles. Most commonly AVR. */
 /* #define PB_CONVERT_DOUBLE_FLOAT 1 */
 
+/* Check whether incoming strings are valid UTF-8 sequences. Slows down
+ * the string processing slightly and slightly increases code size. */
+/* #define PB_VALIDATE_UTF8 1 */
+
 /******************************************************************
  * You usually don't need to change anything below this line.     *
  * Feel free to look around and use the defined macros, though.   *

diff --git a/pb_decode.c b/pb_decode.c
index 62a7d97..68351b5 100644
--- a/pb_decode.c
+++ b/pb_decode.c

@@ -1475,6 +1475,53 @@
     return pb_read(stream, dest->bytes, (size_t)size);
 }
 
+#ifdef PB_VALIDATE_UTF8
+
+/* adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c */
+/* Returns true if the NUL-terminated byte string s passes every UTF-8 check below; returns false at the first sequence that fails. */
+static bool pb_validate_utf8(const uint8_t *s)
+{
+  while (*s) {                                          /* scanning ends at the first NUL byte */
+    if (*s < 0x80)
+      /* 0xxxxxxx */
+      s++;
+    else if ((s[0] & 0xe0) == 0xc0) {
+      /* 110XXXXx 10xxxxxx */
+      if ((s[1] & 0xc0) != 0x80 ||                      /* not a continuation byte (also true for the NUL terminator) */
+          (s[0] & 0xfe) == 0xc0)                        /* overlong? */
+        return false;
+      else
+        s += 2;
+    } else if ((s[0] & 0xf0) == 0xe0) {
+      /* 1110XXXX 10Xxxxxx 10xxxxxx */
+      if ((s[1] & 0xc0) != 0x80 ||                      /* a NUL here fails the test, so || never goes on to read s[2] */
+          (s[2] & 0xc0) != 0x80 ||
+          (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
+          (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
+          (s[0] == 0xef && s[1] == 0xbf &&
+           (s[2] & 0xfe) == 0xbe))                      /* U+FFFE or U+FFFF? */
+        return false;
+      else
+        s += 3;
+    } else if ((s[0] & 0xf8) == 0xf0) {
+      /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+      if ((s[1] & 0xc0) != 0x80 ||                      /* each test fails on a NUL, so later bytes are not read past it */
+          (s[2] & 0xc0) != 0x80 ||
+          (s[3] & 0xc0) != 0x80 ||
+          (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
+          (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+        return false;
+      else
+        s += 4;
+    } else                                              /* remaining lead values: 0x80..0xbf (stray continuation) or 0xf8..0xff */
+      return false;
+  }
+
+  return true;
+}
+
+#endif
+
 static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field)
 {
     uint32_t size;
@@ -1507,7 +1554,18 @@
     }
     
     dest[size] = 0;
+
+#ifdef PB_VALIDATE_UTF8
+    if (!pb_read(stream, dest, (size_t)size))
+        return false;
+
+    if (!pb_validate_utf8((const uint8_t *)dest))
+        PB_RETURN_ERROR(stream, "invalid utf8");
+
+    return true;
+#else
     return pb_read(stream, dest, (size_t)size);
+#endif
 }
 
 static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field)

diff --git a/tests/validate_utf8/SConscript b/tests/validate_utf8/SConscript
new file mode 100644
index 0000000..47b4178
--- /dev/null
+++ b/tests/validate_utf8/SConscript

@@ -0,0 +1,28 @@
+# Rebuild the alltypes encode/decode test with PB_VALIDATE_UTF8=1 defined and run it.
+
+Import("env")
+
+# Copy the alltypes sources and generated files from $BUILD/alltypes into this test's directory.
+c = Copy("$TARGET", "$SOURCE")
+env.Command("alltypes.pb.h", "$BUILD/alltypes/alltypes.pb.h", c)
+env.Command("alltypes.pb.c", "$BUILD/alltypes/alltypes.pb.c", c)
+env.Command("encode_alltypes.c", "$BUILD/alltypes/encode_alltypes.c", c)
+env.Command("decode_alltypes.c", "$BUILD/alltypes/decode_alltypes.c", c)
+
+# Clone the environment and add the PB_VALIDATE_UTF8=1 preprocessor define.
+opts = env.Clone()
+opts.Append(CPPDEFINES = {'PB_VALIDATE_UTF8': 1})
+
+# Compile the nanopb core sources with that define, appending the environment's CORECFLAGS to CFLAGS.
+strict = opts.Clone()
+strict.Append(CFLAGS = strict['CORECFLAGS'])
+strict.Object("pb_decode_validateutf8.o", "$NANOPB/pb_decode.c")
+strict.Object("pb_encode_validateutf8.o", "$NANOPB/pb_encode.c")
+strict.Object("pb_common_validateutf8.o", "$NANOPB/pb_common.c")
+
+# Link encoder and decoder against the rebuilt core objects, then run both (presumably the decoder reads encode_alltypes.output).
+enc = opts.Program(["encode_alltypes.c", "alltypes.pb.c", "pb_encode_validateutf8.o", "pb_common_validateutf8.o"])
+dec = opts.Program(["decode_alltypes.c", "alltypes.pb.c", "pb_decode_validateutf8.o", "pb_common_validateutf8.o"])
+
+env.RunTest(enc)
+env.RunTest([dec, "encode_alltypes.output"])
commit	c00d4c1979c1d0dd481091c148f951e466eab9dc	[log] [tgz]
author	Pavol Rusnak <pavol@rusnak.io>	Mon Dec 16 23:44:50 2019 +0100
committer	Pavol Rusnak <pavol@rusnak.io>	Tue Dec 17 14:36:29 2019 +0100
tree	ed0941940c5d763001bbdf0fab96953af5eb7ece
parent	28706965251c26d5f9dab6defdca2e384cbf4da7 [diff]