Simplify tls_cbc.c slightly.

This removes the now unnecessary virtual calls. Benchmark differences are
mostly positive but probably noise.

Before:
Did 839000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000497us (6.7 MB/sec)
Did 623000 AES-128-CBC-SHA1 (256 bytes) open operations in 2000409us (79.7 MB/sec)
Did 434000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2002909us (292.5 MB/sec)
Did 146000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2000785us (597.8 MB/sec)
Did 82000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2014268us (667.0 MB/sec)

After:
Did 866000 AES-128-CBC-SHA1 (16 bytes) open operations in 2000697us (6.9 MB/sec) [+3.2%]
Did 616000 AES-128-CBC-SHA1 (256 bytes) open operations in 2001403us (78.8 MB/sec) [-1.2%]
Did 432000 AES-128-CBC-SHA1 (1350 bytes) open operations in 2003898us (291.0 MB/sec) [-0.5%]
Did 148000 AES-128-CBC-SHA1 (8192 bytes) open operations in 2006042us (604.4 MB/sec) [+1.1%]
Did 83000 AES-128-CBC-SHA1 (16384 bytes) open operations in 2010885us (676.3 MB/sec) [+1.4%]

Change-Id: I735e99296ca9a1771518c622b8e7e6979a0d30bc
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/46685
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/cipher_extra/tls_cbc.c b/crypto/cipher_extra/tls_cbc.c
index 5e97a1c..f446362 100644
--- a/crypto/cipher_extra/tls_cbc.c
+++ b/crypto/cipher_extra/tls_cbc.c
@@ -193,74 +193,19 @@
     *((p)++) = (uint8_t)((n));       \
   } while (0)
 
-// u64toBE serialises an unsigned, 64-bit number (n) as eight bytes at (p) in
-// big-endian order. The value of p is advanced by eight.
-#define u64toBE(n, p)                \
-  do {                               \
-    *((p)++) = (uint8_t)((n) >> 56); \
-    *((p)++) = (uint8_t)((n) >> 48); \
-    *((p)++) = (uint8_t)((n) >> 40); \
-    *((p)++) = (uint8_t)((n) >> 32); \
-    *((p)++) = (uint8_t)((n) >> 24); \
-    *((p)++) = (uint8_t)((n) >> 16); \
-    *((p)++) = (uint8_t)((n) >> 8);  \
-    *((p)++) = (uint8_t)((n));       \
-  } while (0)
-
-typedef union {
-  SHA_CTX sha1;
-  SHA256_CTX sha256;
-  SHA512_CTX sha512;
-} HASH_CTX;
-
-static void tls1_sha1_transform(HASH_CTX *ctx, const uint8_t *block) {
-  SHA1_Transform(&ctx->sha1, block);
-}
-
-static void tls1_sha256_transform(HASH_CTX *ctx, const uint8_t *block) {
-  SHA256_Transform(&ctx->sha256, block);
-}
-
-static void tls1_sha512_transform(HASH_CTX *ctx, const uint8_t *block) {
-  SHA512_Transform(&ctx->sha512, block);
-}
-
 // These functions serialize the state of a hash and thus perform the standard
 // "final" operation without adding the padding and length that such a function
 // typically does.
-static void tls1_sha1_final_raw(HASH_CTX *ctx, uint8_t *md_out) {
-  SHA_CTX *sha1 = &ctx->sha1;
-  u32toBE(sha1->h[0], md_out);
-  u32toBE(sha1->h[1], md_out);
-  u32toBE(sha1->h[2], md_out);
-  u32toBE(sha1->h[3], md_out);
-  u32toBE(sha1->h[4], md_out);
-}
-
-static void tls1_sha256_final_raw(HASH_CTX *ctx, uint8_t *md_out) {
-  SHA256_CTX *sha256 = &ctx->sha256;
-  for (unsigned i = 0; i < 8; i++) {
-    u32toBE(sha256->h[i], md_out);
-  }
-}
-
-static void tls1_sha512_final_raw(HASH_CTX *ctx, uint8_t *md_out) {
-  SHA512_CTX *sha512 = &ctx->sha512;
-  for (unsigned i = 0; i < 8; i++) {
-    u64toBE(sha512->h[i], md_out);
-  }
+static void tls1_sha1_final_raw(SHA_CTX *ctx, uint8_t *md_out) {
+  u32toBE(ctx->h[0], md_out);
+  u32toBE(ctx->h[1], md_out);
+  u32toBE(ctx->h[2], md_out);
+  u32toBE(ctx->h[3], md_out);
+  u32toBE(ctx->h[4], md_out);
 }
 
 int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) {
-  switch (EVP_MD_type(md)) {
-    case NID_sha1:
-    case NID_sha256:
-    case NID_sha384:
-      return 1;
-
-    default:
-      return 0;
-  }
+  return EVP_MD_type(md) == NID_sha1;
 }
 
 int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out,
@@ -269,14 +214,6 @@
                               size_t data_plus_mac_plus_padding_size,
                               const uint8_t *mac_secret,
                               unsigned mac_secret_length) {
-  HASH_CTX md_state;
-  void (*md_final_raw)(HASH_CTX *ctx, uint8_t *md_out);
-  void (*md_transform)(HASH_CTX *ctx, const uint8_t *block);
-  unsigned md_size, md_block_size = 64, md_block_shift = 6;
-  // md_length_size is the number of bytes in the length field that terminates
-  // the hash.
-  unsigned md_length_size = 8;
-
   // Bound the acceptable input so we can forget about many possible overflows
   // later in this function. This is redundant with the record size limits in
   // TLS.
@@ -285,32 +222,7 @@
     return 0;
   }
 
-  switch (EVP_MD_type(md)) {
-    case NID_sha1:
-      SHA1_Init(&md_state.sha1);
-      md_final_raw = tls1_sha1_final_raw;
-      md_transform = tls1_sha1_transform;
-      md_size = SHA_DIGEST_LENGTH;
-      break;
-
-    case NID_sha256:
-      SHA256_Init(&md_state.sha256);
-      md_final_raw = tls1_sha256_final_raw;
-      md_transform = tls1_sha256_transform;
-      md_size = SHA256_DIGEST_LENGTH;
-      break;
-
-    case NID_sha384:
-      SHA384_Init(&md_state.sha512);
-      md_final_raw = tls1_sha512_final_raw;
-      md_transform = tls1_sha512_transform;
-      md_size = SHA384_DIGEST_LENGTH;
-      md_block_size = 128;
-      md_block_shift = 7;
-      md_length_size = 16;
-      break;
-
-    default:
+  if (EVP_MD_type(md) != NID_sha1) {
       // EVP_tls_cbc_record_digest_supported should have been called first to
       // check that the hash function is supported.
       assert(0);
@@ -318,6 +230,16 @@
       return 0;
   }
 
+  // TODO(davidben): Further simplify this logic, now that we're down to one
+  // hash function.
+
+  SHA_CTX md_state;
+  SHA1_Init(&md_state);
+  unsigned md_size = SHA_DIGEST_LENGTH, md_block_size = 64, md_block_shift = 6;
+  // md_length_size is the number of bytes in the length field that terminates
+  // the hash.
+  unsigned md_length_size = 8;
+
   assert(md_length_size <= MAX_HASH_BIT_COUNT_BYTES);
   assert(md_block_size <= MAX_HASH_BLOCK_SIZE);
   assert(md_block_size == (1u << md_block_shift));
@@ -393,7 +315,7 @@
     hmac_pad[i] ^= 0x36;
   }
 
-  md_transform(&md_state, hmac_pad);
+  SHA1_Transform(&md_state, hmac_pad);
 
   // The length check means |bits| fits in four bytes.
   uint8_t length_bytes[MAX_HASH_BIT_COUNT_BYTES];
@@ -408,9 +330,9 @@
     uint8_t first_block[MAX_HASH_BLOCK_SIZE];
     OPENSSL_memcpy(first_block, header, 13);
     OPENSSL_memcpy(first_block + 13, data, md_block_size - 13);
-    md_transform(&md_state, first_block);
+    SHA1_Transform(&md_state, first_block);
     for (size_t i = 1; i < k / md_block_size; i++) {
-      md_transform(&md_state, data + md_block_size * i - 13);
+      SHA1_Transform(&md_state, data + md_block_size * i - 13);
     }
   }
 
@@ -461,32 +383,25 @@
       block[j] = b;
     }
 
-    md_transform(&md_state, block);
-    md_final_raw(&md_state, block);
+    SHA1_Transform(&md_state, block);
+    tls1_sha1_final_raw(&md_state, block);
     // If this is index_b, copy the hash value to |mac_out|.
     for (size_t j = 0; j < md_size; j++) {
       mac_out[j] |= block[j] & is_block_b;
     }
   }
 
-  EVP_MD_CTX md_ctx;
-  EVP_MD_CTX_init(&md_ctx);
-  if (!EVP_DigestInit_ex(&md_ctx, md, NULL /* engine */)) {
-    EVP_MD_CTX_cleanup(&md_ctx);
-    return 0;
-  }
+  SHA_CTX md_ctx;
+  SHA1_Init(&md_ctx);
 
   // Complete the HMAC in the standard manner.
   for (size_t i = 0; i < md_block_size; i++) {
     hmac_pad[i] ^= 0x6a;
   }
 
-  EVP_DigestUpdate(&md_ctx, hmac_pad, md_block_size);
-  EVP_DigestUpdate(&md_ctx, mac_out, md_size);
-  unsigned md_out_size_u;
-  EVP_DigestFinal(&md_ctx, md_out, &md_out_size_u);
-  *md_out_size = md_out_size_u;
-  EVP_MD_CTX_cleanup(&md_ctx);
-
+  SHA1_Update(&md_ctx, hmac_pad, md_block_size);
+  SHA1_Update(&md_ctx, mac_out, md_size);
+  SHA1_Final(md_out, &md_ctx);
+  *md_out_size = md_size;
   return 1;
 }