Introduce new compile-time flag: PB_VALIDATE_UTF8
Check whether incoming strings are valid UTF-8 sequences. Slightly slows
down string processing and slightly increases code size.
diff --git a/docs/reference.rst b/docs/reference.rst
index 39bc611..82fb480 100644
--- a/docs/reference.rst
+++ b/docs/reference.rst
@@ -61,6 +61,9 @@
arrays. Such example is older protobuf.js.
PB_CONVERT_DOUBLE_FLOAT Convert doubles to floats for platforms that do
not support 64-bit doubles. Mainly AVR.
+PB_VALIDATE_UTF8 Check whether incoming strings are valid UTF-8
+ sequences. Slightly slows down string processing
+ and slightly increases code size.
============================ ================================================
The PB_MAX_REQUIRED_FIELDS, PB_FIELD_16BIT and PB_FIELD_32BIT settings allow
diff --git a/pb.h b/pb.h
index 74cc7e9..9ef8019 100644
--- a/pb.h
+++ b/pb.h
@@ -49,6 +49,10 @@
* support 64-bit doubles. Most commonly AVR. */
/* #define PB_CONVERT_DOUBLE_FLOAT 1 */
+/* Check whether incoming strings are valid UTF-8 sequences. Slightly slows
+ * down string processing and slightly increases code size. */
+/* #define PB_VALIDATE_UTF8 1 */
+
/******************************************************************
* You usually don't need to change anything below this line. *
* Feel free to look around and use the defined macros, though. *
diff --git a/pb_decode.c b/pb_decode.c
index 62a7d97..68351b5 100644
--- a/pb_decode.c
+++ b/pb_decode.c
@@ -1475,6 +1475,53 @@
return pb_read(stream, dest->bytes, (size_t)size);
}
+#ifdef PB_VALIDATE_UTF8
+
+/* adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c */
+/* Returns true if every byte sequence in s before its terminating 0 byte passes the checks below, false otherwise. */
+static bool pb_validate_utf8(const uint8_t *s)
+{
+ while (*s) { /* a 0 byte ends the scan. NOTE(review): bytes after an embedded 0 are never examined - confirm callers accept that */
+ if (*s < 0x80)
+ /* 0xxxxxxx: single-byte character */
+ s++;
+ else if ((s[0] & 0xe0) == 0xc0) {
+ /* 110XXXXx 10xxxxxx: two-byte sequence */
+ if ((s[1] & 0xc0) != 0x80 || /* s[1] must be a 10xxxxxx continuation byte; a 0 byte fails here */
+ (s[0] & 0xfe) == 0xc0) /* overlong? (lead byte 0xc0 or 0xc1) */
+ return false;
+ else
+ s += 2;
+ } else if ((s[0] & 0xf0) == 0xe0) {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx: three-byte sequence */
+ if ((s[1] & 0xc0) != 0x80 || /* tested in order: || stops at a 0 byte before s[2] is read */
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return false;
+ else
+ s += 3;
+ } else if ((s[0] & 0xf8) == 0xf0) {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: four-byte sequence */
+ if ((s[1] & 0xc0) != 0x80 || /* tested in order: || stops at a 0 byte before later bytes are read */
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return false;
+ else
+ s += 4;
+ } else
+ return false; /* first byte is a stray continuation byte (10xxxxxx) or has the form 11111xxx */
+ }
+
+ return true; /* reached the terminating 0 byte without rejecting anything */
+}
+
+#endif
+
static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field)
{
uint32_t size;
@@ -1507,7 +1554,18 @@
}
dest[size] = 0;
+
+#ifdef PB_VALIDATE_UTF8
+ if (!pb_read(stream, dest, (size_t)size))
+ return false;
+
+ if (!pb_validate_utf8((const uint8_t *)dest))
+ PB_RETURN_ERROR(stream, "invalid utf8");
+
+ return true;
+#else
return pb_read(stream, dest, (size_t)size);
+#endif
}
static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field)
diff --git a/tests/validate_utf8/SConscript b/tests/validate_utf8/SConscript
new file mode 100644
index 0000000..47b4178
--- /dev/null
+++ b/tests/validate_utf8/SConscript
@@ -0,0 +1,28 @@
+# Run the alltypes test case, but compile with PB_VALIDATE_UTF8=1
+
+Import("env")
+
+# Take copy of the files for custom build.
+c = Copy("$TARGET", "$SOURCE")
+env.Command("alltypes.pb.h", "$BUILD/alltypes/alltypes.pb.h", c)
+env.Command("alltypes.pb.c", "$BUILD/alltypes/alltypes.pb.c", c)
+env.Command("encode_alltypes.c", "$BUILD/alltypes/encode_alltypes.c", c)
+env.Command("decode_alltypes.c", "$BUILD/alltypes/decode_alltypes.c", c)
+
+# Define the compilation options
+opts = env.Clone()
+opts.Append(CPPDEFINES = {'PB_VALIDATE_UTF8': 1})
+
+# Build new version of core
+strict = opts.Clone()
+strict.Append(CFLAGS = strict['CORECFLAGS'])
+strict.Object("pb_decode_validateutf8.o", "$NANOPB/pb_decode.c")
+strict.Object("pb_encode_validateutf8.o", "$NANOPB/pb_encode.c")
+strict.Object("pb_common_validateutf8.o", "$NANOPB/pb_common.c")
+
+# Now build and run the test normally.
+enc = opts.Program(["encode_alltypes.c", "alltypes.pb.c", "pb_encode_validateutf8.o", "pb_common_validateutf8.o"])
+dec = opts.Program(["decode_alltypes.c", "alltypes.pb.c", "pb_decode_validateutf8.o", "pb_common_validateutf8.o"])
+
+env.RunTest(enc)
+env.RunTest([dec, "encode_alltypes.output"])