Move pb_validate_utf8() to pb_common.c (#437)

The validation function can also be useful for user code.
Encoding now checks validity as well, because it's
better to catch invalid messages at the transmitting end.

Added unittests for the validation.
diff --git a/pb_common.c b/pb_common.c
index 00bc7d6..4f7e688 100644
--- a/pb_common.c
+++ b/pb_common.c
@@ -245,4 +245,68 @@
 
 }
 
+#ifdef PB_VALIDATE_UTF8
+
+/* Returns true if the null-terminated string is valid UTF-8 text, false if
+ * not. 'str' must not be NULL: it is dereferenced without a check.
+ * Algorithm is adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
+ * Original copyright: Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> 2005-03-30
+ * Licensed under "Short code license", which allows use under MIT license or
+ * any compatible with it.
+ */
+
+bool pb_validate_utf8(const char *str)
+{
+    const pb_byte_t *s = (const pb_byte_t*)str;
+    while (*s) /* NUL ends the scan; it also fails every 10xxxxxx test below */
+    {
+        if (*s < 0x80)
+        {
+            /* 0xxxxxxx (single-byte ASCII) */
+            s++;
+        }
+        else if ((s[0] & 0xe0) == 0xc0)
+        {
+            /* 110XXXXx 10xxxxxx */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[0] & 0xfe) == 0xc0)                        /* overlong? */
+                return false;
+            else
+                s += 2;
+        }
+        else if ((s[0] & 0xf0) == 0xe0)
+        {
+            /* 1110XXXX 10Xxxxxx 10xxxxxx */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
+                (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
+                (s[0] == 0xef && s[1] == 0xbf &&
+                (s[2] & 0xfe) == 0xbe))                 /* U+FFFE or U+FFFF? */
+                return false;
+            else
+                s += 3;
+        }
+        else if ((s[0] & 0xf8) == 0xf0)
+        {
+            /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
+                (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+                return false;
+            else
+                s += 4;
+        }
+        else
+        {
+            return false; /* stray 10xxxxxx byte or 0xf8..0xff lead byte */
+        }
+    }
+
+    return true;
+}
+
+#endif
 
diff --git a/pb_common.h b/pb_common.h
index b0f53d5..4db04d2 100644
--- a/pb_common.h
+++ b/pb_common.h
@@ -26,6 +26,11 @@
  * Returns false if no such field exists. */
 bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag);
 
+#ifdef PB_VALIDATE_UTF8
+/* Validate null-terminated UTF-8 text string; returns true if it is valid. */
+bool pb_validate_utf8(const char *s);
+#endif
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/pb_decode.c b/pb_decode.c
index 892e688..052a531 100644
--- a/pb_decode.c
+++ b/pb_decode.c
@@ -1480,53 +1480,6 @@
     return pb_read(stream, dest->bytes, (size_t)size);
 }
 
-#ifdef PB_VALIDATE_UTF8
-
-/* adapted from https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c */
-
-static bool pb_validate_utf8(const uint8_t *s)
-{
-  while (*s) {
-    if (*s < 0x80)
-      /* 0xxxxxxx */
-      s++;
-    else if ((s[0] & 0xe0) == 0xc0) {
-      /* 110XXXXx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[0] & 0xfe) == 0xc0)                        /* overlong? */
-        return false;
-      else
-        s += 2;
-    } else if ((s[0] & 0xf0) == 0xe0) {
-      /* 1110XXXX 10Xxxxxx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[2] & 0xc0) != 0x80 ||
-          (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||    /* overlong? */
-          (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||    /* surrogate? */
-          (s[0] == 0xef && s[1] == 0xbf &&
-           (s[2] & 0xfe) == 0xbe))                      /* U+FFFE or U+FFFF? */
-        return false;
-      else
-        s += 3;
-    } else if ((s[0] & 0xf8) == 0xf0) {
-      /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
-      if ((s[1] & 0xc0) != 0x80 ||
-          (s[2] & 0xc0) != 0x80 ||
-          (s[3] & 0xc0) != 0x80 ||
-          (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||    /* overlong? */
-          (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
-        return false;
-      else
-        s += 4;
-    } else
-      return false;
-  }
-
-  return true;
-}
-
-#endif
-
 static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_iter_t *field)
 {
     uint32_t size;
@@ -1560,17 +1513,15 @@
     
     dest[size] = 0;
 
-#ifdef PB_VALIDATE_UTF8
     if (!pb_read(stream, dest, (size_t)size))
         return false;
 
-    if (!pb_validate_utf8((const uint8_t *)dest))
+#ifdef PB_VALIDATE_UTF8
+    if (!pb_validate_utf8((const char*)dest))
         PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
 
     return true;
-#else
-    return pb_read(stream, dest, (size_t)size);
-#endif
 }
 
 static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_iter_t *field)
diff --git a/pb_encode.c b/pb_encode.c
index 4e40505..40f96bc 100644
--- a/pb_encode.c
+++ b/pb_encode.c
@@ -889,6 +889,11 @@
         }
     }
 
+#ifdef PB_VALIDATE_UTF8
+    if (!pb_validate_utf8(str))
+        PB_RETURN_ERROR(stream, "invalid utf8");
+#endif
+
     return pb_encode_string(stream, (const pb_byte_t*)str, size);
 }
 
diff --git a/tests/common_unittests/common_unittests.c b/tests/common_unittests/common_unittests.c
index 1052a95..a2671cd 100644
--- a/tests/common_unittests/common_unittests.c
+++ b/tests/common_unittests/common_unittests.c
@@ -1,3 +1,4 @@
+#define PB_VALIDATE_UTF8
 #include "pb_common.c"
 
 #include <stdio.h>
@@ -108,6 +109,24 @@
         TEST(iter.submessage_index == 0)
     }
 
+    {
+        COMMENT("Test pb_validate_utf8()");
+
+        TEST(pb_validate_utf8("abcdefg"));
+        TEST(pb_validate_utf8("\xc3\xa4\xc3\xa4\x6b\x6b\xc3\xb6\x6e\x65\x6e\x0a"));
+        TEST(!pb_validate_utf8("\xc3\xa4\xc3\xa4\x6b\x6b\xb6\xc3\x6e\x65\x6e\x0a"));
+        TEST(pb_validate_utf8("\xed\x9f\xbf"));
+        TEST(pb_validate_utf8("\xee\x80\x80"));
+        TEST(pb_validate_utf8("\xef\xbf\xbd"));
+        TEST(pb_validate_utf8("\xf4\x8f\xbf\xbf"));
+        TEST(!pb_validate_utf8("a\x80z"));
+        TEST(!pb_validate_utf8("a\xbfz"));
+        TEST(!pb_validate_utf8("a\xfez"));
+        TEST(!pb_validate_utf8("a\xffz"));
+        TEST(!pb_validate_utf8("a\xc0\xafz"));
+        TEST(!pb_validate_utf8("a\xef\xbf\xbez"));
+    }
+
     if (status != 0)
         fprintf(stdout, "\n\nSome tests FAILED!\n");