Remove alignment requirement on CRYPTO_poly1305_finish.

This dates to https://boringssl-review.googlesource.com/2850, which was
done in response to an ARM crash. I assume the ARM crash was due to
poly1305_arm.c casting pointers around, which is technically UB, even on
x86 since C says it is UB to cast pointers if the value would be
unaligned. (Also I believe it's a strict aliasing violation, though the
compilers really ought to give us a sanitizer for it if they're excited
about that optimization.)

Replace with memcpy, which any reasonable compiler would compile the
same on platforms that support unaligned access. ARM does support it
these days, so perhaps the crash came from an older ARM?

Benchmarks showed no difference with this CL.

Change-Id: I022bdb84f95e45c143ad19359f646ee1416d5ae9
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/39344
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c
index 4aff713..004221d 100644
--- a/crypto/poly1305/poly1305_arm.c
+++ b/crypto/poly1305/poly1305_arm.c
@@ -103,7 +103,17 @@
   r->v[8] = y4;
 }
 
-static void fe1305x2_tobytearray(uint8_t *r, fe1305x2 *x) {
+static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); }
+
+// load32 exists to avoid breaking strict aliasing rules in
+// fe1305x2_frombytearray.
+static uint32_t load32(const uint8_t t[4]) {
+  uint32_t tmp;
+  OPENSSL_memcpy(&tmp, t, sizeof(tmp));
+  return tmp;
+}
+
+static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) {
   uint32_t x0 = x->v[0];
   uint32_t x1 = x->v[2];
   uint32_t x2 = x->v[4];
@@ -119,22 +129,13 @@
   x4 += x3 >> 26;
   x3 &= 0x3ffffff;
 
-  *(uint32_t *)r = x0 + (x1 << 26);
-  *(uint32_t *)(r + 4) = (x1 >> 6) + (x2 << 20);
-  *(uint32_t *)(r + 8) = (x2 >> 12) + (x3 << 14);
-  *(uint32_t *)(r + 12) = (x3 >> 18) + (x4 << 8);
+  store32(r, x0 + (x1 << 26));
+  store32(r + 4, (x1 >> 6) + (x2 << 20));
+  store32(r + 8, (x2 >> 12) + (x3 << 14));
+  store32(r + 12, (x3 >> 18) + (x4 << 8));
 }
 
-// load32 exists to avoid breaking strict aliasing rules in
-// fe1305x2_frombytearray.
-static uint32_t load32(uint8_t *t) {
-  uint32_t tmp;
-  OPENSSL_memcpy(&tmp, t, sizeof(tmp));
-  return tmp;
-}
-
-static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x,
-                                   unsigned long long xlen) {
+static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) {
   unsigned i;
   uint8_t t[17];
 
@@ -190,11 +191,11 @@
   fe1305x2 *const precomp = c + 1;
   unsigned int j;
 
-  r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *)key;
-  r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *)(key + 3)) >> 2);
-  r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *)(key + 6)) >> 4);
-  r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *)(key + 9)) >> 6);
-  r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *)(key + 12)) >> 8);
+  r->v[1] = r->v[0] = 0x3ffffff & load32(key);
+  r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2);
+  r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4);
+  r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6);
+  r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8);
 
   for (j = 0; j < 10; j++) {
     h->v[j] = 0;  // XXX: should fast-forward a bit
diff --git a/crypto/poly1305/poly1305_test.cc b/crypto/poly1305/poly1305_test.cc
index 198cca1..930db01 100644
--- a/crypto/poly1305/poly1305_test.cc
+++ b/crypto/poly1305/poly1305_test.cc
@@ -62,8 +62,7 @@
   // Consume the remainder and finish.
   CRYPTO_poly1305_update(&state, in.data() + done, in.size() - done);
 
-  // |CRYPTO_poly1305_finish| requires a 16-byte-aligned output.
-  alignas(16) uint8_t out[16];
+  uint8_t out[16];
   CRYPTO_poly1305_finish(&state, out);
   EXPECT_EQ(Bytes(out), Bytes(mac)) << "SIMD pattern " << excess << " failed.";
 }
@@ -81,8 +80,7 @@
     poly1305_state state;
     CRYPTO_poly1305_init(&state, key.data());
     CRYPTO_poly1305_update(&state, in.data(), in.size());
-    // |CRYPTO_poly1305_finish| requires a 16-byte-aligned output.
-    alignas(16) uint8_t out[16];
+    uint8_t out[16];
     CRYPTO_poly1305_finish(&state, out);
     EXPECT_EQ(Bytes(out), Bytes(mac)) << "Single-shot Poly1305 failed.";
 
@@ -94,6 +92,17 @@
     CRYPTO_poly1305_finish(&state, out);
     EXPECT_EQ(Bytes(out), Bytes(mac)) << "Streaming Poly1305 failed.";
 
+    // Test |CRYPTO_poly1305_init| and |CRYPTO_poly1305_finish| work on
+    // unaligned values.
+    alignas(8) uint8_t unaligned_key[32 + 1];
+    OPENSSL_memcpy(unaligned_key + 1, key.data(), 32);
+    CRYPTO_poly1305_init(&state, unaligned_key + 1);
+    CRYPTO_poly1305_update(&state, in.data(), in.size());
+    alignas(8) uint8_t unaligned_out[16 + 1];
+    CRYPTO_poly1305_finish(&state, unaligned_out + 1);
+    EXPECT_EQ(Bytes(unaligned_out + 1, 16), Bytes(mac))
+        << "Unaligned Poly1305 failed.";
+
     // Test SIMD stress patterns. OpenSSL's AVX2 assembly needs a multiple of
     // four blocks, so test up to three blocks of excess.
     TestSIMD(0, key, in, mac);
diff --git a/crypto/poly1305/poly1305_vec.c b/crypto/poly1305/poly1305_vec.c
index e7b3ae5..7860e0a 100644
--- a/crypto/poly1305/poly1305_vec.c
+++ b/crypto/poly1305/poly1305_vec.c
@@ -27,9 +27,21 @@
 
 #include <emmintrin.h>
 
-#define U8TO64_LE(m) (*(const uint64_t *)(m))
-#define U8TO32_LE(m) (*(const uint32_t *)(m))
-#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v
+static uint32_t load_u32_le(const uint8_t in[4]) {
+  uint32_t ret;
+  OPENSSL_memcpy(&ret, in, 4);
+  return ret;
+}
+
+static uint64_t load_u64_le(const uint8_t in[8]) {
+  uint64_t ret;
+  OPENSSL_memcpy(&ret, in, 8);
+  return ret;
+}
+
+static void store_u64_le(uint8_t out[8], uint64_t v) {
+  OPENSSL_memcpy(out, &v, 8);
+}
 
 typedef __m128i xmmi;
 
@@ -96,8 +108,8 @@
   uint64_t t0, t1;
 
   // clamp key
-  t0 = U8TO64_LE(key + 0);
-  t1 = U8TO64_LE(key + 8);
+  t0 = load_u64_le(key + 0);
+  t1 = load_u64_le(key + 8);
   r0 = t0 & 0xffc0fffffff;
   t0 >>= 44;
   t0 |= t1 << 20;
@@ -115,10 +127,10 @@
   p->R22.d[3] = (uint32_t)(r2 >> 32);
 
   // store pad
-  p->R23.d[1] = U8TO32_LE(key + 16);
-  p->R23.d[3] = U8TO32_LE(key + 20);
-  p->R24.d[1] = U8TO32_LE(key + 24);
-  p->R24.d[3] = U8TO32_LE(key + 28);
+  p->R23.d[1] = load_u32_le(key + 16);
+  p->R23.d[3] = load_u32_le(key + 20);
+  p->R24.d[1] = load_u32_le(key + 24);
+  p->R24.d[3] = load_u32_le(key + 28);
 
   // H = 0
   st->H[0] = _mm_setzero_si128();
@@ -750,8 +762,8 @@
   }
 
 poly1305_donna_atleast16bytes:
-  t0 = U8TO64_LE(m + 0);
-  t1 = U8TO64_LE(m + 8);
+  t0 = load_u64_le(m + 0);
+  t1 = load_u64_le(m + 8);
   h0 += t0 & 0xfffffffffff;
   t0 = shr128_pair(t1, t0, 44);
   h1 += t0 & 0xfffffffffff;
@@ -790,8 +802,8 @@
   OPENSSL_memset(m + leftover, 0, 16 - leftover);
   leftover = 16;
 
-  t0 = U8TO64_LE(m + 0);
-  t1 = U8TO64_LE(m + 8);
+  t0 = load_u64_le(m + 0);
+  t1 = load_u64_le(m + 8);
   h0 += t0 & 0xfffffffffff;
   t0 = shr128_pair(t1, t0, 44);
   h1 += t0 & 0xfffffffffff;
@@ -837,8 +849,8 @@
   t1 = (t1 >> 24);
   h2 += (t1)+c;
 
-  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
-  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
+  store_u64_le(mac + 0, ((h0) | (h1 << 44)));
+  store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
 }
 
 #endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64
diff --git a/include/openssl/poly1305.h b/include/openssl/poly1305.h
index cefe2b1..a38ef21 100644
--- a/include/openssl/poly1305.h
+++ b/include/openssl/poly1305.h
@@ -28,19 +28,17 @@
 // authentication tag with the one-time key |key|. Note that |key| is a
 // one-time key and therefore there is no `reset' method because that would
 // enable several messages to be authenticated with the same key.
-OPENSSL_EXPORT void CRYPTO_poly1305_init(poly1305_state* state,
+OPENSSL_EXPORT void CRYPTO_poly1305_init(poly1305_state *state,
                                          const uint8_t key[32]);
 
 // CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called
 // zero or more times after poly1305_init.
-OPENSSL_EXPORT void CRYPTO_poly1305_update(poly1305_state* state,
-                                           const uint8_t* in,
-                                           size_t in_len);
+OPENSSL_EXPORT void CRYPTO_poly1305_update(poly1305_state *state,
+                                           const uint8_t *in, size_t in_len);
 
 // CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16
-// byte authentication tag to |mac|. The |mac| address must be 16-byte
-// aligned.
-OPENSSL_EXPORT void CRYPTO_poly1305_finish(poly1305_state* state,
+// byte authentication tag to |mac|.
+OPENSSL_EXPORT void CRYPTO_poly1305_finish(poly1305_state *state,
                                            uint8_t mac[16]);