Move pb_validate_utf8() to pb_common.c (#437) The validation function can also be useful for user code. The encoder now checks UTF-8 validity as well, because it is better to catch invalid messages at the transmitting end. Added unit tests for the validation.

commit: 9c0130ddf429db380f0224ecabd2c98d24e17b6d [log] [tgz]
author: Petteri Aimonen <jpa@git.mail.kapsi.fi> Wed Dec 18 19:12:06 2019 +0200
committer: Petteri Aimonen <jpa@git.mail.kapsi.fi> Wed Dec 18 19:12:06 2019 +0200
tree: 822a70ba0e08606baa73f0b994ccc7ebab71d369
parent: c5b5765ac170fda492d897c4b7fa2d927fa946ed [diff]
diff --git a/pb_common.c b/pb_common.c
index 00bc7d6..4f7e688 100644
--- a/pb_common.c
+++ b/pb_common.c

@@ -245,4 +245,68 @@
 
 }
 
+#ifdef PB_VALIDATE_UTF8
+
+/* This function checks whether a string is valid UTF-8 text.
+ *
+ * Algorithm is adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+ * Original copyright: Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> 2005-03-30
+ * Licensed under "Short code license", which allows use under MIT license or
+ * any compatible with it.
+ */
+
+bool pb_validate_utf8(const char *str)
+{
+    const pb_byte_t *s = (const pb_byte_t*)str; /* NOTE(review): comparisons below assume pb_byte_t is unsigned - confirm */
+    while (*s) /* scan one encoded character per iteration until the terminating NUL */
+    {
+        if (*s < 0x80)
+        {
+            /* 0xxxxxxx: single byte below 0x80, always accepted */
+            s++;
+        }
+        else if ((s[0] & 0xe0) == 0xc0)
+        {
+            /* 110XXXXx 10xxxxxx: two-byte form; a NUL fails the 10xxxxxx test, so reads stop at the terminator */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[0] & 0xfe) == 0xc0)                        /* overlong: lead byte 0xc0 or 0xc1 */
+                return false;
+            else
+                s += 2;
+        }
+        else if ((s[0] & 0xf0) == 0xe0)
+        {
+            /* 1110XXXX 10Xxxxxx 10xxxxxx: three-byte form; continuation bytes are tested in order before anything else */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong: e0 followed by 80..9f */
+                (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate: ed followed by a0..bf */
+                (s[0] == 0xef && s[1] == 0xbf &&
+                (s[2] & 0xfe) == 0xbe))                 /* ef bf be / ef bf bf, i.e. U+FFFE or U+FFFF */
+                return false;
+            else
+                s += 3;
+        }
+        else if ((s[0] & 0xf8) == 0xf0)
+        {
+            /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: four-byte form; continuation bytes are tested in order first */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong: f0 followed by 80..8f */
+                (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* above U+10FFFF: f4 90.. or lead byte f5..f7 */
+                return false;
+            else
+                s += 4;
+        }
+        else
+        {
+            return false; /* not a valid lead byte: stray continuation (80..bf) or f8..ff */
+        }
+    }
+
+    return true;
+}
+
+#endif
 

diff --git a/pb_common.h b/pb_common.h
index b0f53d5..4db04d2 100644
--- a/pb_common.h
+++ b/pb_common.h

@@ -26,6 +26,11 @@
  * Returns false if no such field exists. */
 bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag);
 
+#ifdef PB_VALIDATE_UTF8
+/* Check that a NUL-terminated string is valid UTF-8 text; returns true if valid, false otherwise. */
+bool pb_validate_utf8(const char *s);
+#endif
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif

diff --git a/pb_decode.c b/pb_decode.c
index 892e688..052a531 100644
--- a/pb_decode.c
+++ b/pb_decode.c

@@ -1480,53 +1480,6 @@
     return pb_read(stream, dest->bytes, (size_t)size);
 }
 
-#ifdef PB_VALIDATE_UTF8
-
-/* adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c */
-
-static bool pb_validate_utf8(const uint8_t *s)
-{
-  while (*s) {
-    if (*s < 0x80)
-      /* 0xxxxxxx */
-      s++;
-    else if ((s[0] & 0xe0) == 0xc0) {
-      /* 110XXXXx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[0] & 0xfe) == 0xc0)                        /* overlong? */
-        return false;
-      else
-        s += 2;
-    } else if ((s[0] & 0xf0) == 0xe0) {
-      /* 1110XXXX 10Xxxxxx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[2] & 0xc0) != 0x80 ||
-          (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
-          (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
-          (s[0] == 0xef && s[1] == 0xbf &&
-           (s[2] & 0xfe) == 0xbe))                      /* U+FFFE or U+FFFF? */
-        return false;
-      else
-        s += 3;
-    } else if ((s[0] & 0xf8) == 0xf0) {
-      /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[2] & 0xc0) != 0x80 ||
-          (s[3] & 0xc0) != 0x80 ||
-          (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
-          (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
-        return false;
-      else
-        s += 4;
-    } else
-      return false;
-  }
-
-  return true;
-}
-
-#endif
-
 static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field)
 {
     uint32_t size;
@@ -1560,17 +1513,15 @@
     
     dest[size] = 0;
 
-#ifdef PB_VALIDATE_UTF8
     if (!pb_read(stream, dest, (size_t)size))
         return false;
 
-    if (!pb_validate_utf8((const uint8_t *)dest))
+#ifdef PB_VALIDATE_UTF8
+    if (!pb_validate_utf8((const char*)dest))
         PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
 
     return true;
-#else
-    return pb_read(stream, dest, (size_t)size);
-#endif
 }
 
 static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field)

diff --git a/pb_encode.c b/pb_encode.c
index 4e40505..40f96bc 100644
--- a/pb_encode.c
+++ b/pb_encode.c

@@ -889,6 +889,11 @@
         }
     }
 
+#ifdef PB_VALIDATE_UTF8
+    if (!pb_validate_utf8(str))
+        PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
+
     return pb_encode_string(stream, (const pb_byte_t*)str, size);
 }
 

diff --git a/tests/common_unittests/common_unittests.c b/tests/common_unittests/common_unittests.c
index 1052a95..a2671cd 100644
--- a/tests/common_unittests/common_unittests.c
+++ b/tests/common_unittests/common_unittests.c

@@ -1,3 +1,4 @@
+#define PB_VALIDATE_UTF8
 #include "pb_common.c"
 
 #include <stdio.h>
@@ -108,6 +109,24 @@
         TEST(iter.submessage_index == 0)
     }
 
+    {
+        COMMENT("Test pb_validate_utf8()");
+
+        TEST(pb_validate_utf8("abcdefg"));
+        TEST(pb_validate_utf8("\xc3\xa4\xc3\xa4\x6b\x6b\xc3\xb6\x6e\x65\x6e\x0a"));
+        TEST(!pb_validate_utf8("\xc3\xa4\xc3\xa4\x6b\x6b\xb6\xc3\x6e\x65\x6e\x0a"));
+        TEST(pb_validate_utf8("\xed\x9f\xbf"));
+        TEST(pb_validate_utf8("\xee\x80\x80"));
+        TEST(pb_validate_utf8("\xef\xbf\xbd"));
+        TEST(pb_validate_utf8("\xf4\x8f\xbf\xbf"));
+        TEST(!pb_validate_utf8("a\x80z"));
+        TEST(!pb_validate_utf8("a\xbfz"));
+        TEST(!pb_validate_utf8("a\xfez"));
+        TEST(!pb_validate_utf8("a\xffz"));
+        TEST(!pb_validate_utf8("a\xc0\xafz"));
+        TEST(!pb_validate_utf8("a\xef\xbf\xbez"));
+    }
+
     if (status != 0)
         fprintf(stdout, "\n\nSome tests FAILED!\n");
commit	9c0130ddf429db380f0224ecabd2c98d24e17b6d	[log] [tgz]
author	Petteri Aimonen <jpa@git.mail.kapsi.fi>	Wed Dec 18 19:12:06 2019 +0200
committer	Petteri Aimonen <jpa@git.mail.kapsi.fi>	Wed Dec 18 19:12:06 2019 +0200
tree	822a70ba0e08606baa73f0b994ccc7ebab71d369
parent	c5b5765ac170fda492d897c4b7fa2d927fa946ed [diff]