Use new encoding functions in ASN1_mbstring_ncopy.

Update-Note: This changes causes BoringSSL to be stricter about handling
Unicode strings:
  · Reject code points outside of Unicode
  · Reject surrogate values
  · Don't allow invalid UTF-8 to pass through when the source claims to
    be UTF-8 already.
  · Drop byte-order marks.

Previously, for example, a UniversalString could contain a large-valued
code point that would cause the UTF-8 encoder to emit invalid UTF-8.

Change-Id: I94d9db7796b70491b04494be84249907ff8fb46c
Reviewed-on: https://boringssl-review.googlesource.com/28325
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/asn1/a_mbstr.c b/crypto/asn1/a_mbstr.c
index a2789ed..1bbcd1b 100644
--- a/crypto/asn1/a_mbstr.c
+++ b/crypto/asn1/a_mbstr.c
@@ -56,23 +56,16 @@
 
 #include <openssl/asn1.h>
 
+#include <limits.h>
 #include <string.h>
 
+#include <openssl/bytestring.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
 
 #include "asn1_locl.h"
+#include "../bytestring/internal.h"
 
-static int traverse_string(const unsigned char *p, int len, int inform,
-                           int (*rfunc) (uint32_t value, void *in),
-                           void *arg);
-static int in_utf8(uint32_t value, void *arg);
-static int out_utf8(uint32_t value, void *arg);
-static int type_str(uint32_t value, void *arg);
-static int cpy_asc(uint32_t value, void *arg);
-static int cpy_bmp(uint32_t value, void *arg);
-static int cpy_univ(uint32_t value, void *arg);
-static int cpy_utf8(uint32_t value, void *arg);
 static int is_printable(uint32_t value);
 
 /*
@@ -90,55 +83,45 @@
     return ASN1_mbstring_ncopy(out, in, len, inform, mask, 0, 0);
 }
 
+OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_BMPSTRING)
+OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UNIVERSALSTRING)
+OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UTF8STRING)
+
 int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, int len,
                         int inform, unsigned long mask,
                         long minsize, long maxsize)
 {
     int str_type;
-    int ret;
     char free_out;
-    int outform, outlen = 0;
     ASN1_STRING *dest;
-    unsigned char *p;
-    int nchar;
+    size_t nchar = 0;
     char strbuf[32];
-    int (*cpyfunc) (uint32_t, void *) = NULL;
     if (len == -1)
         len = strlen((const char *)in);
     if (!mask)
         mask = DIRSTRING_TYPE;
 
-    /* First do a string check and work out the number of characters */
+    int (*decode_func)(CBS *, uint32_t*);
+    int error;
     switch (inform) {
-
     case MBSTRING_BMP:
-        if (len & 1) {
-            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING_LENGTH);
-            return -1;
-        }
-        nchar = len >> 1;
+        decode_func = cbs_get_ucs2_be;
+        error = ASN1_R_INVALID_BMPSTRING;
         break;
 
     case MBSTRING_UNIV:
-        if (len & 3) {
-            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING_LENGTH);
-            return -1;
-        }
-        nchar = len >> 2;
+        decode_func = cbs_get_utf32_be;
+        error = ASN1_R_INVALID_UNIVERSALSTRING;
         break;
 
     case MBSTRING_UTF8:
-        nchar = 0;
-        /* This counts the characters and does utf8 syntax checking */
-        ret = traverse_string(in, len, MBSTRING_UTF8, in_utf8, &nchar);
-        if (ret < 0) {
-            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UTF8STRING);
-            return -1;
-        }
+        decode_func = cbs_get_utf8;
+        error = ASN1_R_INVALID_UTF8STRING;
         break;
 
     case MBSTRING_ASC:
-        nchar = len;
+        decode_func = cbs_get_latin1;
+        error = ERR_R_INTERNAL_ERROR;  // Latin-1 inputs are never invalid.
         break;
 
     default:
@@ -146,44 +129,92 @@
         return -1;
     }
 
-    if ((minsize > 0) && (nchar < minsize)) {
+    /* Check |minsize| and |maxsize| and work out the minimal type, if any. */
+    CBS cbs;
+    CBS_init(&cbs, in, len);
+    size_t utf8_len = 0;
+    while (CBS_len(&cbs) != 0) {
+        uint32_t c;
+        if (!decode_func(&cbs, &c)) {
+            OPENSSL_PUT_ERROR(ASN1, error);
+            return -1;
+        }
+        if (nchar == 0 &&
+            (inform == MBSTRING_BMP || inform == MBSTRING_UNIV) &&
+            c == 0xfeff) {
+            /* Reject byte-order mark. We could drop it but that would mean
+             * adding ambiguity around whether a BOM was included or not when
+             * matching strings.
+             *
+             * For a little-endian UCS-2 string, the BOM will appear as 0xfffe
+             * and will be rejected as noncharacter, below. */
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS);
+            return -1;
+        }
+
+        /* Update which output formats are still possible. */
+        if ((mask & B_ASN1_PRINTABLESTRING) && !is_printable(c)) {
+            mask &= ~B_ASN1_PRINTABLESTRING;
+        }
+        if ((mask & B_ASN1_IA5STRING) && (c > 127)) {
+            mask &= ~B_ASN1_IA5STRING;
+        }
+        if ((mask & B_ASN1_T61STRING) && (c > 0xff)) {
+            mask &= ~B_ASN1_T61STRING;
+        }
+        if ((mask & B_ASN1_BMPSTRING) && (c > 0xffff)) {
+            mask &= ~B_ASN1_BMPSTRING;
+        }
+        if (!mask) {
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS);
+            return -1;
+        }
+
+        nchar++;
+        utf8_len += cbb_get_utf8_len(c);
+    }
+
+    if (minsize > 0 && nchar < (size_t)minsize) {
         OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT);
         BIO_snprintf(strbuf, sizeof strbuf, "%ld", minsize);
         ERR_add_error_data(2, "minsize=", strbuf);
         return -1;
     }
 
-    if ((maxsize > 0) && (nchar > maxsize)) {
+    if (maxsize > 0 && nchar > (size_t)maxsize) {
         OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_LONG);
         BIO_snprintf(strbuf, sizeof strbuf, "%ld", maxsize);
         ERR_add_error_data(2, "maxsize=", strbuf);
         return -1;
     }
 
-    /* Now work out minimal type (if any) */
-    if (traverse_string(in, len, inform, type_str, &mask) < 0) {
-        OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS);
-        return -1;
-    }
-
     /* Now work out output format and string type */
-    outform = MBSTRING_ASC;
-    if (mask & B_ASN1_PRINTABLESTRING)
+    int (*encode_func)(CBB *, uint32_t) = cbb_add_latin1;
+    size_t size_estimate = nchar;
+    int outform = MBSTRING_ASC;
+    if (mask & B_ASN1_PRINTABLESTRING) {
         str_type = V_ASN1_PRINTABLESTRING;
-    else if (mask & B_ASN1_IA5STRING)
+    } else if (mask & B_ASN1_IA5STRING) {
         str_type = V_ASN1_IA5STRING;
-    else if (mask & B_ASN1_T61STRING)
+    } else if (mask & B_ASN1_T61STRING) {
         str_type = V_ASN1_T61STRING;
-    else if (mask & B_ASN1_BMPSTRING) {
+    } else if (mask & B_ASN1_BMPSTRING) {
         str_type = V_ASN1_BMPSTRING;
         outform = MBSTRING_BMP;
+        encode_func = cbb_add_ucs2_be;
+        size_estimate = 2 * nchar;
     } else if (mask & B_ASN1_UNIVERSALSTRING) {
         str_type = V_ASN1_UNIVERSALSTRING;
+        encode_func = cbb_add_utf32_be;
+        size_estimate = 4 * nchar;
         outform = MBSTRING_UNIV;
     } else {
         str_type = V_ASN1_UTF8STRING;
         outform = MBSTRING_UTF8;
+        encode_func = cbb_add_utf8;
+        size_estimate = utf8_len;
     }
+
     if (!out)
         return str_type;
     if (*out) {
@@ -204,6 +235,7 @@
         }
         *out = dest;
     }
+
     /* If both the same type just copy across */
     if (inform == outform) {
         if (!ASN1_STRING_set(dest, in, len)) {
@@ -213,179 +245,41 @@
         return str_type;
     }
 
-    /* Work out how much space the destination will need */
-    switch (outform) {
-    case MBSTRING_ASC:
-        outlen = nchar;
-        cpyfunc = cpy_asc;
-        break;
-
-    case MBSTRING_BMP:
-        outlen = nchar << 1;
-        cpyfunc = cpy_bmp;
-        break;
-
-    case MBSTRING_UNIV:
-        outlen = nchar << 2;
-        cpyfunc = cpy_univ;
-        break;
-
-    case MBSTRING_UTF8:
-        outlen = 0;
-        traverse_string(in, len, inform, out_utf8, &outlen);
-        cpyfunc = cpy_utf8;
-        break;
-    }
-    if (!(p = OPENSSL_malloc(outlen + 1))) {
-        if (free_out)
-            ASN1_STRING_free(dest);
+    CBB cbb;
+    if (!CBB_init(&cbb, size_estimate + 1)) {
         OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
-        return -1;
+        goto err;
     }
-    dest->length = outlen;
-    dest->data = p;
-    p[outlen] = 0;
-    traverse_string(in, len, inform, cpyfunc, &p);
+    CBS_init(&cbs, in, len);
+    while (CBS_len(&cbs) != 0) {
+        uint32_t c;
+        if (!decode_func(&cbs, &c) ||
+            !encode_func(&cbb, c)) {
+            OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR);
+            goto err;
+        }
+    }
+    uint8_t *data = NULL;
+    size_t data_len;
+    if (/* OpenSSL historically NUL-terminated this value with a single byte,
+         * even for |MBSTRING_BMP| and |MBSTRING_UNIV|. */
+        !CBB_add_u8(&cbb, 0) ||
+        !CBB_finish(&cbb, &data, &data_len) ||
+        data_len < 1 ||
+        data_len > INT_MAX) {
+        OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR);
+        OPENSSL_free(data);
+        goto err;
+    }
+    dest->length = (int)(data_len - 1);
+    dest->data = data;
     return str_type;
-}
 
-/*
- * This function traverses a string and passes the value of each character to
- * an optional function along with a void * argument.
- */
-
-static int traverse_string(const unsigned char *p, int len, int inform,
-                           int (*rfunc) (uint32_t value, void *in),
-                           void *arg)
-{
-    uint32_t value;
-    int ret;
-    while (len) {
-        if (inform == MBSTRING_ASC) {
-            value = *p++;
-            len--;
-        } else if (inform == MBSTRING_BMP) {
-            value = *p++ << 8;
-            value |= *p++;
-            len -= 2;
-        } else if (inform == MBSTRING_UNIV) {
-            value = ((uint32_t)*p++) << 24;
-            value |= ((uint32_t)*p++) << 16;
-            value |= *p++ << 8;
-            value |= *p++;
-            len -= 4;
-        } else {
-            ret = UTF8_getc(p, len, &value);
-            if (ret < 0)
-                return -1;
-            len -= ret;
-            p += ret;
-        }
-        if (rfunc) {
-            ret = rfunc(value, arg);
-            if (ret <= 0)
-                return ret;
-        }
-    }
-    return 1;
-}
-
-/* Various utility functions for traverse_string */
-
-/* Just count number of characters */
-
-static int in_utf8(uint32_t value, void *arg)
-{
-    int *nchar;
-    nchar = arg;
-    (*nchar)++;
-    return 1;
-}
-
-/* Determine size of output as a UTF8 String */
-
-static int out_utf8(uint32_t value, void *arg)
-{
-    int *outlen;
-    outlen = arg;
-    *outlen += UTF8_putc(NULL, -1, value);
-    return 1;
-}
-
-/*
- * Determine the "type" of a string: check each character against a supplied
- * "mask".
- */
-
-static int type_str(uint32_t value, void *arg)
-{
-    unsigned long types;
-    types = *((unsigned long *)arg);
-    if ((types & B_ASN1_PRINTABLESTRING) && !is_printable(value))
-        types &= ~B_ASN1_PRINTABLESTRING;
-    if ((types & B_ASN1_IA5STRING) && (value > 127))
-        types &= ~B_ASN1_IA5STRING;
-    if ((types & B_ASN1_T61STRING) && (value > 0xff))
-        types &= ~B_ASN1_T61STRING;
-    if ((types & B_ASN1_BMPSTRING) && (value > 0xffff))
-        types &= ~B_ASN1_BMPSTRING;
-    if (!types)
-        return -1;
-    *((unsigned long *)arg) = types;
-    return 1;
-}
-
-/* Copy one byte per character ASCII like strings */
-
-static int cpy_asc(uint32_t value, void *arg)
-{
-    unsigned char **p, *q;
-    p = arg;
-    q = *p;
-    *q = (unsigned char)value;
-    (*p)++;
-    return 1;
-}
-
-/* Copy two byte per character BMPStrings */
-
-static int cpy_bmp(uint32_t value, void *arg)
-{
-    unsigned char **p, *q;
-    p = arg;
-    q = *p;
-    *q++ = (unsigned char)((value >> 8) & 0xff);
-    *q = (unsigned char)(value & 0xff);
-    *p += 2;
-    return 1;
-}
-
-/* Copy four byte per character UniversalStrings */
-
-static int cpy_univ(uint32_t value, void *arg)
-{
-    unsigned char **p, *q;
-    p = arg;
-    q = *p;
-    *q++ = (unsigned char)((value >> 24) & 0xff);
-    *q++ = (unsigned char)((value >> 16) & 0xff);
-    *q++ = (unsigned char)((value >> 8) & 0xff);
-    *q = (unsigned char)(value & 0xff);
-    *p += 4;
-    return 1;
-}
-
-/* Copy to a UTF8String */
-
-static int cpy_utf8(uint32_t value, void *arg)
-{
-    unsigned char **p;
-    int ret;
-    p = arg;
-    /* We already know there is enough room so pass 0xff as the length */
-    ret = UTF8_putc(*p, 0xff, value);
-    *p += ret;
-    return 1;
+ err:
+    if (free_out)
+        ASN1_STRING_free(dest);
+    CBB_cleanup(&cbb);
+    return -1;
 }
 
 /* Return 1 if the character is permitted in a PrintableString */
diff --git a/crypto/err/asn1.errordata b/crypto/err/asn1.errordata
index 56cbbe52..271561b 100644
--- a/crypto/err/asn1.errordata
+++ b/crypto/err/asn1.errordata
@@ -40,14 +40,14 @@
 ASN1,139,INTEGER_NOT_ASCII_FORMAT
 ASN1,140,INTEGER_TOO_LARGE_FOR_LONG
 ASN1,141,INVALID_BIT_STRING_BITS_LEFT
-ASN1,142,INVALID_BMPSTRING_LENGTH
+ASN1,142,INVALID_BMPSTRING
 ASN1,143,INVALID_DIGIT
 ASN1,144,INVALID_MODIFIER
 ASN1,145,INVALID_NUMBER
 ASN1,146,INVALID_OBJECT_ENCODING
 ASN1,147,INVALID_SEPARATOR
 ASN1,148,INVALID_TIME_FORMAT
-ASN1,149,INVALID_UNIVERSALSTRING_LENGTH
+ASN1,149,INVALID_UNIVERSALSTRING
 ASN1,150,INVALID_UTF8STRING
 ASN1,151,LIST_ERROR
 ASN1,152,MISSING_ASN1_EOS
diff --git a/crypto/test/test_util.cc b/crypto/test/test_util.cc
index 493b124..4ad777f 100644
--- a/crypto/test/test_util.cc
+++ b/crypto/test/test_util.cc
@@ -30,6 +30,10 @@
 }
 
 std::ostream &operator<<(std::ostream &os, const Bytes &in) {
+  if (in.len == 0) {
+    return os << "<empty Bytes>";
+  }
+
   // Print a byte slice as hex.
   static const char hex[] = "0123456789abcdef";
   for (size_t i = 0; i < in.len; i++) {
diff --git a/crypto/x509/a_strex.c b/crypto/x509/a_strex.c
index c0c346d..6dc183a 100644
--- a/crypto/x509/a_strex.c
+++ b/crypto/x509/a_strex.c
@@ -189,13 +189,13 @@
     switch (charwidth) {
     case 4:
         if (buflen & 3) {
-            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING_LENGTH);
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING);
             return -1;
         }
         break;
     case 2:
         if (buflen & 1) {
-            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING_LENGTH);
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING);
             return -1;
         }
         break;
diff --git a/crypto/x509/x509_test.cc b/crypto/x509/x509_test.cc
index 7615324..9c99aee 100644
--- a/crypto/x509/x509_test.cc
+++ b/crypto/x509/x509_test.cc
@@ -20,8 +20,8 @@
 
 #include <openssl/bio.h>
 #include <openssl/bytestring.h>
-#include <openssl/curve25519.h>
 #include <openssl/crypto.h>
+#include <openssl/curve25519.h>
 #include <openssl/digest.h>
 #include <openssl/err.h>
 #include <openssl/pem.h>
@@ -30,6 +30,7 @@
 #include <openssl/x509v3.h>
 
 #include "../internal.h"
+#include "../test/test_util.h"
 
 
 std::string GetTestData(const char *path);
@@ -1283,3 +1284,62 @@
   EXPECT_EQ(X509_NAME_ENTRY_set(X509_NAME_get_entry(name.get(), 1)), 1);
   EXPECT_EQ(X509_NAME_ENTRY_set(X509_NAME_get_entry(name.get(), 2)), 2);
 }
+
+TEST(X509Test, StringDecoding) {
+  static const struct {
+    std::vector<uint8_t> in;
+    int type;
+    const char *expected;
+  } kTests[] = {
+      // Non-minimal, two-byte UTF-8.
+      {{0xc0, 0x81}, V_ASN1_UTF8STRING, nullptr},
+      // Non-minimal, three-byte UTF-8.
+      {{0xe0, 0x80, 0x81}, V_ASN1_UTF8STRING, nullptr},
+      // Non-minimal, four-byte UTF-8.
+      {{0xf0, 0x80, 0x80, 0x81}, V_ASN1_UTF8STRING, nullptr},
+      // Truncated, four-byte UTF-8.
+      {{0xf0, 0x80, 0x80}, V_ASN1_UTF8STRING, nullptr},
+      // Low-surrogate value.
+      {{0xed, 0xa0, 0x80}, V_ASN1_UTF8STRING, nullptr},
+      // High-surrogate value.
+      {{0xed, 0xb0, 0x81}, V_ASN1_UTF8STRING, nullptr},
+      // Initial BOMs should be rejected from UCS-2 and UCS-4.
+      {{0xfe, 0xff, 0, 88}, V_ASN1_BMPSTRING, nullptr},
+      {{0, 0, 0xfe, 0xff, 0, 0, 0, 88}, V_ASN1_UNIVERSALSTRING, nullptr},
+      // Otherwise, BOMs should pass through.
+      {{0, 88, 0xfe, 0xff}, V_ASN1_BMPSTRING, "X\xef\xbb\xbf"},
+      {{0, 0, 0, 88, 0, 0, 0xfe, 0xff}, V_ASN1_UNIVERSALSTRING,
+       "X\xef\xbb\xbf"},
+      // The maximum code-point should pass though.
+      {{0, 16, 0xff, 0xfd}, V_ASN1_UNIVERSALSTRING, "\xf4\x8f\xbf\xbd"},
+      // Values outside the Unicode space should not.
+      {{0, 17, 0, 0}, V_ASN1_UNIVERSALSTRING, nullptr},
+      // Non-characters should be rejected.
+      {{0, 1, 0xff, 0xff}, V_ASN1_UNIVERSALSTRING, nullptr},
+      {{0, 1, 0xff, 0xfe}, V_ASN1_UNIVERSALSTRING, nullptr},
+      {{0, 0, 0xfd, 0xd5}, V_ASN1_UNIVERSALSTRING, nullptr},
+      // BMPString is UCS-2, not UTF-16, so surrogate pairs are invalid.
+      {{0xd8, 0, 0xdc, 1}, V_ASN1_BMPSTRING, nullptr},
+  };
+
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kTests); i++) {
+    SCOPED_TRACE(i);
+    const auto& test = kTests[i];
+    ASN1_STRING s;
+    s.type = test.type;
+    s.data = const_cast<uint8_t*>(test.in.data());
+    s.length = test.in.size();
+
+    uint8_t *utf8;
+    const int utf8_len = ASN1_STRING_to_UTF8(&utf8, &s);
+    EXPECT_EQ(utf8_len < 0, test.expected == nullptr);
+    if (utf8_len >= 0) {
+      if (test.expected != nullptr) {
+        EXPECT_EQ(Bytes(test.expected), Bytes(utf8, utf8_len));
+      }
+      OPENSSL_free(utf8);
+    } else {
+      ERR_clear_error();
+    }
+  }
+}
diff --git a/include/openssl/asn1.h b/include/openssl/asn1.h
index f2e92a7..f7b6b86 100644
--- a/include/openssl/asn1.h
+++ b/include/openssl/asn1.h
@@ -152,6 +152,9 @@
 /* For use with ASN1_mbstring_copy() */
 #define MBSTRING_FLAG		0x1000
 #define MBSTRING_UTF8		(MBSTRING_FLAG)
+/* |MBSTRING_ASC| refers to Latin-1, not ASCII. It is used with TeletexString
+ * which, in turn, is treated as Latin-1 rather than T.61 by OpenSSL and most
+ * other software. */
 #define MBSTRING_ASC		(MBSTRING_FLAG|1)
 #define MBSTRING_BMP		(MBSTRING_FLAG|2)
 #define MBSTRING_UNIV		(MBSTRING_FLAG|4)
@@ -926,14 +929,14 @@
 #define ASN1_R_INTEGER_NOT_ASCII_FORMAT 139
 #define ASN1_R_INTEGER_TOO_LARGE_FOR_LONG 140
 #define ASN1_R_INVALID_BIT_STRING_BITS_LEFT 141
-#define ASN1_R_INVALID_BMPSTRING_LENGTH 142
+#define ASN1_R_INVALID_BMPSTRING 142
 #define ASN1_R_INVALID_DIGIT 143
 #define ASN1_R_INVALID_MODIFIER 144
 #define ASN1_R_INVALID_NUMBER 145
 #define ASN1_R_INVALID_OBJECT_ENCODING 146
 #define ASN1_R_INVALID_SEPARATOR 147
 #define ASN1_R_INVALID_TIME_FORMAT 148
-#define ASN1_R_INVALID_UNIVERSALSTRING_LENGTH 149
+#define ASN1_R_INVALID_UNIVERSALSTRING 149
 #define ASN1_R_INVALID_UTF8STRING 150
 #define ASN1_R_LIST_ERROR 151
 #define ASN1_R_MISSING_ASN1_EOS 152