Tidy up how ASN1_STRING_print_ex figures out the type.

Between the lookup table, the multiple layers of reuse of the "type"
variable, it is a little hard to follow what's going on with
ASN1_STRING_print_ex. Replace the lookup table with a switch-case
(implicitly handles the bounds check, and we can let the compiler figure
out the best spelling). Then, rather than returning a "character width",
which doen't represent UTF-8, just use the already-defined MBSTRING_*
constants.

(These changes should be covered by the existing ASN1Test.StringPrintEx
test.)

Change-Id: Ie3b2557bfae0f65db969e90cd0c76bc8ade963d4
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52365
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/asn1/a_strex.c b/crypto/asn1/a_strex.c
index 3732894..56aa033 100644
--- a/crypto/asn1/a_strex.c
+++ b/crypto/asn1/a_strex.c
@@ -56,6 +56,7 @@
 
 #include <openssl/asn1.h>
 
+#include <assert.h>
 #include <ctype.h>
 #include <inttypes.h>
 #include <string.h>
@@ -155,100 +156,98 @@
     return 1;
 }
 
-#define BUF_TYPE_WIDTH_MASK     0x7
-#define BUF_TYPE_CONVUTF8       0x8
-
 /*
  * This function sends each character in a buffer to do_esc_char(). It
  * interprets the content formats and converts to or from UTF8 as
  * appropriate.
  */
 
-static int do_buf(unsigned char *buf, int buflen,
-                  int type, unsigned char flags, char *quotes, BIO *out)
+static int do_buf(const unsigned char *buf, int buflen, int encoding,
+                  int utf8_convert, unsigned char flags, char *quotes, BIO *out)
 {
-    int i, outlen, len, charwidth;
-    unsigned char orflags, *p, *q;
-    uint32_t c;
-    p = buf;
-    q = buf + buflen;
-    outlen = 0;
-    charwidth = type & BUF_TYPE_WIDTH_MASK;
-
-    switch (charwidth) {
-    case 4:
+    /* Reject invalid UCS-4 and UCS-2 lengths without parsing. */
+    switch (encoding) {
+    case MBSTRING_UNIV:
         if (buflen & 3) {
             OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING);
             return -1;
         }
         break;
-    case 2:
+    case MBSTRING_BMP:
         if (buflen & 1) {
             OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING);
             return -1;
         }
         break;
-    default:
-        break;
     }
 
+    const unsigned char *p = buf;
+    const unsigned char *q = buf + buflen;
+    int outlen = 0;
     while (p != q) {
-        if (p == buf && flags & ASN1_STRFLGS_ESC_2253)
+        unsigned char orflags = 0;
+        if (p == buf && flags & ASN1_STRFLGS_ESC_2253) {
             orflags = CHARTYPE_FIRST_ESC_2253;
-        else
-            orflags = 0;
+        }
         /* TODO(davidben): Replace this with |cbs_get_ucs2_be|, etc., to check
-         * for invalid codepoints. */
-        switch (charwidth) {
-        case 4:
+         * for invalid codepoints. Before doing that, enforce it in the parser,
+         * https://crbug.com/boringssl/427, so these error cases are not
+         * reachable from parsed objects. */
+        uint32_t c;
+        switch (encoding) {
+        case MBSTRING_UNIV:
             c = ((uint32_t)*p++) << 24;
             c |= ((uint32_t)*p++) << 16;
             c |= ((uint32_t)*p++) << 8;
             c |= *p++;
             break;
 
-        case 2:
+        case MBSTRING_BMP:
             c = ((uint32_t)*p++) << 8;
             c |= *p++;
             break;
 
-        case 1:
+        case MBSTRING_ASC:
             c = *p++;
             break;
 
-        case 0:
-            i = UTF8_getc(p, buflen, &c);
-            if (i < 0)
+        case MBSTRING_UTF8: {
+            int consumed = UTF8_getc(p, buflen, &c);
+            if (consumed < 0)
                 return -1;      /* Invalid UTF8String */
-            buflen -= i;
-            p += i;
+            buflen -= consumed;
+            p += consumed;
             break;
+        }
+
         default:
-            return -1;          /* invalid width */
+            assert(0);
+            return -1;
         }
         if (p == q && flags & ASN1_STRFLGS_ESC_2253)
             orflags = CHARTYPE_LAST_ESC_2253;
-        if (type & BUF_TYPE_CONVUTF8) {
+        if (utf8_convert) {
             unsigned char utfbuf[6];
             int utflen;
             utflen = UTF8_putc(utfbuf, sizeof utfbuf, c);
-            for (i = 0; i < utflen; i++) {
+            for (int i = 0; i < utflen; i++) {
                 /*
                  * We don't need to worry about setting orflags correctly
                  * because if utflen==1 its value will be correct anyway
                  * otherwise each character will be > 0x7f and so the
                  * character will never be escaped on first and last.
                  */
-                len = do_esc_char(utfbuf[i], (unsigned char)(flags | orflags),
-                                  quotes, out);
-                if (len < 0)
+                int len = do_esc_char(utfbuf[i], flags | orflags, quotes, out);
+                if (len < 0) {
                     return -1;
+                }
                 outlen += len;
             }
         } else {
-            len = do_esc_char(c, (unsigned char)(flags | orflags), quotes, out);
-            if (len < 0)
+            int len = do_esc_char(c, flags | orflags, quotes, out);
+            if (len < 0) {
                 return -1;
+            }
             outlen += len;
         }
     }
@@ -331,22 +330,31 @@
     return outlen + 1;
 }
 
-/*
- * Lookup table to convert tags to character widths, 0 = UTF8 encoded, -1 is
- * used for non string types otherwise it is the number of bytes per
- * character
- */
-
-static const signed char tag2nbyte[] = {
-    -1, -1, -1, -1, -1,         /* 0-4 */
-    -1, -1, -1, -1, -1,         /* 5-9 */
-    -1, -1, 0, -1,              /* 10-13 */
-    -1, -1, -1, -1,             /* 15-17 */
-    1, 1, 1,                    /* 18-20 */
-    -1, 1, 1, 1,                /* 21-24 */
-    -1, 1, -1,                  /* 25-27 */
-    4, -1, 2                    /* 28-30 */
-};
+/* string_type_to_encoding returns the |MBSTRING_*| constant for the encoding
+ * used by the |ASN1_STRING| type |type|, or -1 if |tag| is not a string
+ * type. */
+static int string_type_to_encoding(int type) {
+    /* This function is sometimes passed ASN.1 universal types and sometimes
+     * passed |ASN1_STRING| type values */
+    switch (type) {
+    case V_ASN1_UTF8STRING:
+        return MBSTRING_UTF8;
+    case V_ASN1_NUMERICSTRING:
+    case V_ASN1_PRINTABLESTRING:
+    case V_ASN1_T61STRING:
+    case V_ASN1_IA5STRING:
+    case V_ASN1_UTCTIME:
+    case V_ASN1_GENERALIZEDTIME:
+    case V_ASN1_ISO64STRING:
+        /* |MBSTRING_ASC| refers to Latin-1, not ASCII. */
+        return MBSTRING_ASC;
+    case V_ASN1_UNIVERSALSTRING:
+        return MBSTRING_UNIV;
+    case V_ASN1_BMPSTRING:
+        return MBSTRING_BMP;
+    }
+    return -1;
+}
 
 /*
  * This is the main function, print out an ASN1_STRING taking note of various
@@ -356,79 +364,77 @@
 
 int ASN1_STRING_print_ex(BIO *out, const ASN1_STRING *str, unsigned long lflags)
 {
-    int outlen, len;
-    int type;
-    char quotes;
-    unsigned char flags;
-    quotes = 0;
     /* Keep a copy of escape flags */
-    flags = (unsigned char)(lflags & ESC_FLAGS);
-
-    type = str->type;
-
-    outlen = 0;
-
+    unsigned char flags = (unsigned char)(lflags & ESC_FLAGS);
+    int type = str->type;
+    int outlen = 0;
     if (lflags & ASN1_STRFLGS_SHOW_TYPE) {
-        const char *tagname;
-        tagname = ASN1_tag2str(type);
+        const char *tagname = ASN1_tag2str(type);
         outlen += strlen(tagname);
         if (!maybe_write(out, tagname, outlen) || !maybe_write(out, ":", 1))
             return -1;
         outlen++;
     }
 
-    /* Decide what to do with type, either dump content or display it */
-
-    /* Dump everything */
-    if (lflags & ASN1_STRFLGS_DUMP_ALL)
-        type = -1;
-    /* Ignore the string type */
-    else if (lflags & ASN1_STRFLGS_IGNORE_TYPE)
-        type = 1;
-    else {
-        /* Else determine width based on type */
-        if ((type > 0) && (type < 31))
-            type = tag2nbyte[type];
-        else
-            type = -1;
-        if ((type == -1) && !(lflags & ASN1_STRFLGS_DUMP_UNKNOWN))
-            type = 1;
+    /* Decide what to do with |str|, either dump the contents or display it. */
+    int encoding;
+    if (lflags & ASN1_STRFLGS_DUMP_ALL) {
+        /* Dump everything. */
+        encoding = -1;
+    } else if (lflags & ASN1_STRFLGS_IGNORE_TYPE) {
+        /* Ignore the string type and interpret the contents as Latin-1. */
+        encoding = MBSTRING_ASC;
+    } else {
+        encoding = string_type_to_encoding(type);
+        if (encoding == -1 && (lflags & ASN1_STRFLGS_DUMP_UNKNOWN) == 0) {
+            encoding = MBSTRING_ASC;
+        }
     }
 
-    if (type == -1) {
-        len = do_dump(lflags, out, str);
+    if (encoding == -1) {
+        int len = do_dump(lflags, out, str);
         if (len < 0)
             return -1;
         outlen += len;
         return outlen;
     }
 
+    int utf8_convert = 0;
     if (lflags & ASN1_STRFLGS_UTF8_CONVERT) {
-        /*
-         * Note: if string is UTF8 and we want to convert to UTF8 then we
-         * just interpret it as 1 byte per character to avoid converting
-         * twice.
-         */
-        if (!type)
-            type = 1;
-        else
-            type |= BUF_TYPE_CONVUTF8;
+        /* If the string is UTF-8, skip decoding and just interpret it as 1 byte
+         * per character to avoid converting twice.
+         *
+         * TODO(davidben): This is not quite a valid optimization if the input
+         * was invalid UTF-8. */
+        if (encoding == MBSTRING_UTF8) {
+            encoding = MBSTRING_ASC;
+        } else {
+            utf8_convert = 1;
+        }
     }
 
-    len = do_buf(str->data, str->length, type, flags, &quotes, NULL);
-    if (len < 0)
+    /* Measure the length. */
+    char quotes = 0;
+    int len = do_buf(str->data, str->length, encoding, utf8_convert, flags,
+                     &quotes, NULL);
+    if (len < 0) {
         return -1;
+    }
     outlen += len;
-    if (quotes)
+    if (quotes) {
         outlen += 2;
-    if (!out)
+    }
+    if (!out) {
         return outlen;
-    if (quotes && !maybe_write(out, "\"", 1))
+    }
+
+    /* Encode the value. */
+    if ((quotes && !maybe_write(out, "\"", 1)) ||
+        do_buf(str->data, str->length, encoding, utf8_convert, flags, NULL,
+               out) < 0 ||
+        (quotes && !maybe_write(out, "\"", 1))) {
         return -1;
-    if (do_buf(str->data, str->length, type, flags, NULL, out) < 0)
-        return -1;
-    if (quotes && !maybe_write(out, "\"", 1))
-        return -1;
+    }
     return outlen;
 }
 
@@ -451,22 +457,19 @@
 
 int ASN1_STRING_to_UTF8(unsigned char **out, const ASN1_STRING *in)
 {
-    ASN1_STRING stmp, *str = &stmp;
-    int mbflag, type, ret;
     if (!in)
         return -1;
-    type = in->type;
-    if ((type < 0) || (type > 30))
+    int mbflag = string_type_to_encoding(in->type);
+    if (mbflag == -1) {
+        OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_TAG);
         return -1;
-    mbflag = tag2nbyte[type];
-    if (mbflag == -1)
-        return -1;
-    mbflag |= MBSTRING_FLAG;
+    }
+    ASN1_STRING stmp, *str = &stmp;
     stmp.data = NULL;
     stmp.length = 0;
     stmp.flags = 0;
-    ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag,
-                             B_ASN1_UTF8STRING);
+    int ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag,
+                                 B_ASN1_UTF8STRING);
     if (ret < 0)
         return ret;
     *out = stmp.data;
diff --git a/crypto/asn1/asn1_test.cc b/crypto/asn1/asn1_test.cc
index 6e8a1a9..6087ef4 100644
--- a/crypto/asn1/asn1_test.cc
+++ b/crypto/asn1/asn1_test.cc
@@ -871,6 +871,8 @@
       {{0, 0, 0xfd, 0xd5}, V_ASN1_UNIVERSALSTRING, nullptr},
       // BMPString is UCS-2, not UTF-16, so surrogate pairs are invalid.
       {{0xd8, 0, 0xdc, 1}, V_ASN1_BMPSTRING, nullptr},
+      // INTEGERs are stored as strings, but cannot be converted to UTF-8.
+      {{0x01}, V_ASN1_INTEGER, nullptr},
   };
 
   for (const auto &test : kTests) {