Add an ABI testing framework.

Dear reader, I must apologize in advance. This CL contains the following:

- A new 256-line perlasm file with non-trivial perl bits and a dual-ABI
  variadic function caller.

- C preprocessor gymnastics, with variadic macros and fun facts about
  __VA_ARGS__'s behavior on empty argument lists.

- C++ template gymnastics, including variadic arguments, template
  specialization, std::enable_if, and machinery to control template argument
  deduction.

Enjoy.

This tests that our assembly functions correctly honor platform ABI
conventions. Right now this only tests callee-saved registers, but it should be
extendable to SEH/CFI unwind testing with single-step debugging APIs.
Register-checking does not involve anything funny and should be compatible with
SDE. (The future unwind testing is unlikely to be compatible.)

This CL adds support for x86_64 SysV and Win64 ABIs. ARM, AArch64, and x86 can
be added in the future. The testing is injected in two places. First, all the
assembly tests in p256-x86_64_test.cc are now instrumented. This is the
intended workflow and should capture all registers.

However, we currently do not unit-test our assembly much directly. We should do
that as follow-up work[0] but, in the meantime, I've also wrapped all of the GTest
main function in an ABI test. This is imperfect as ABI failures may be masked
by other stack frames, but it costs nothing[1] and is pretty reliable at
catching Win64 xmm register failures.

[0] An alternate strategy would be, in debug builds, unconditionally instrument
every assembly call in libcrypto. But the CHECK_ABI macro would be difficult to
replicate in pure C, and unwind testing may be too invasive for this. Still,
something to consider when we C++ libcrypto.

[1] When single-stepped unwind testing exists, it won't cost nothing. The
gtest_main.cc call will turn unwind testing off.

Change-Id: I6643b26445891fd46abfacac52bc024024c8d7f6
Reviewed-on: https://boringssl-review.googlesource.com/c/33764
Reviewed-by: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <alangley@gmail.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index bf69649..8635910 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -132,6 +132,7 @@
     cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT}
     cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT}
     hrss/asm/poly_rq_mul.S
+    test/trampoline-x86_64.${ASM_EXT}
   )
 endif()
 
@@ -141,6 +142,7 @@
 perlasm(chacha/chacha-x86_64.${ASM_EXT} chacha/asm/chacha-x86_64.pl)
 perlasm(cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT} cipher_extra/asm/aes128gcmsiv-x86_64.pl)
 perlasm(cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT} cipher_extra/asm/chacha20_poly1305_x86_64.pl)
+perlasm(test/trampoline-x86_64.${ASM_EXT} test/asm/trampoline-x86_64.pl)
 
 add_custom_command(
   OUTPUT err_data.c
@@ -424,6 +426,7 @@
 add_executable(
   crypto_test
 
+  abi_self_test.cc
   asn1/asn1_test.cc
   base64/base64_test.cc
   buf/buf_test.cc
diff --git a/crypto/abi_self_test.cc b/crypto/abi_self_test.cc
new file mode 100644
index 0000000..cbc771f
--- /dev/null
+++ b/crypto/abi_self_test.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <gtest/gtest.h>
+#include <gtest/gtest-spi.h>
+
+#include <openssl/rand.h>
+
+#include "test/abi_test.h"
+
+
+static bool test_function_was_called = false;  // Set by |TestFunction|.
+static void TestFunction(int a1, int a2, int a3, int a4, int a5, int a6, int a7,
+                         int a8, int a9, int a10) {  // 10 is |CheckImpl|'s max.
+  test_function_was_called = true;
+  EXPECT_EQ(1, a1);  // Argument N must arrive with value N, in order.
+  EXPECT_EQ(2, a2);
+  EXPECT_EQ(3, a3);
+  EXPECT_EQ(4, a4);
+  EXPECT_EQ(5, a5);
+  EXPECT_EQ(6, a6);
+  EXPECT_EQ(7, a7);
+  EXPECT_EQ(8, a8);
+  EXPECT_EQ(9, a9);
+  EXPECT_EQ(10, a10);
+}
+
+TEST(ABITest, SanityCheck) {
+  EXPECT_NE(0, CHECK_ABI(strcmp, "hello", "world"));  // Result is forwarded.
+
+  test_function_was_called = false;
+  CHECK_ABI(TestFunction, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);  // void return.
+  EXPECT_TRUE(test_function_was_called);
+
+#if defined(SUPPORTS_ABI_TEST)
+  abi_test::internal::CallerState state;  // Filled with random register values.
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
+  const char *arg1 = "hello", *arg2 = "world";
+  crypto_word_t argv[2] = {
+      reinterpret_cast<crypto_word_t>(arg1),
+      reinterpret_cast<crypto_word_t>(arg2),
+  };
+  CHECK_ABI(abi_test_trampoline, reinterpret_cast<crypto_word_t>(strcmp),
+            &state, argv, 2);  // The trampoline itself must satisfy the ABI.
+#endif  // SUPPORTS_ABI_TEST
+}
+
+#if defined(OPENSSL_X86_64) && defined(SUPPORTS_ABI_TEST)
+extern "C" {
+void abi_test_clobber_rax(void);
+void abi_test_clobber_rbx(void);
+void abi_test_clobber_rcx(void);
+void abi_test_clobber_rdx(void);
+void abi_test_clobber_rsi(void);
+void abi_test_clobber_rdi(void);
+void abi_test_clobber_rbp(void);
+void abi_test_clobber_r8(void);
+void abi_test_clobber_r9(void);
+void abi_test_clobber_r10(void);
+void abi_test_clobber_r11(void);
+void abi_test_clobber_r12(void);
+void abi_test_clobber_r13(void);
+void abi_test_clobber_r14(void);
+void abi_test_clobber_r15(void);
+void abi_test_clobber_xmm0(void);
+void abi_test_clobber_xmm1(void);
+void abi_test_clobber_xmm2(void);
+void abi_test_clobber_xmm3(void);
+void abi_test_clobber_xmm4(void);
+void abi_test_clobber_xmm5(void);
+void abi_test_clobber_xmm6(void);
+void abi_test_clobber_xmm7(void);
+void abi_test_clobber_xmm8(void);
+void abi_test_clobber_xmm9(void);
+void abi_test_clobber_xmm10(void);
+void abi_test_clobber_xmm11(void);
+void abi_test_clobber_xmm12(void);
+void abi_test_clobber_xmm13(void);
+void abi_test_clobber_xmm14(void);
+void abi_test_clobber_xmm15(void);
+}  // extern "C"
+
+TEST(ABITest, X86_64) {
+  // abi_test_trampoline hides unsaved registers from the caller, so we can
+  // safely call the abi_test_clobber_* functions below.
+  abi_test::internal::CallerState state;
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
+  CHECK_ABI(abi_test_trampoline,
+            reinterpret_cast<crypto_word_t>(abi_test_clobber_rbx), &state,
+            nullptr, 0);  // The trampoline call itself must report no failure.
+
+  CHECK_ABI(abi_test_clobber_rax);  // Not callee-saved, so no failure.
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_rbx), "");
+  CHECK_ABI(abi_test_clobber_rcx);
+  CHECK_ABI(abi_test_clobber_rdx);
+#if defined(OPENSSL_WINDOWS)  // rdi and rsi are callee-saved on Win64 only.
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_rdi), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_rsi), "");
+#else
+  CHECK_ABI(abi_test_clobber_rdi);
+  CHECK_ABI(abi_test_clobber_rsi);
+#endif
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_rbp), "");
+  CHECK_ABI(abi_test_clobber_r8);
+  CHECK_ABI(abi_test_clobber_r9);
+  CHECK_ABI(abi_test_clobber_r10);
+  CHECK_ABI(abi_test_clobber_r11);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_r12), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_r13), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_r14), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_r15), "");
+
+  CHECK_ABI(abi_test_clobber_xmm0);  // xmm0-xmm5 are never callee-saved here.
+  CHECK_ABI(abi_test_clobber_xmm1);
+  CHECK_ABI(abi_test_clobber_xmm2);
+  CHECK_ABI(abi_test_clobber_xmm3);
+  CHECK_ABI(abi_test_clobber_xmm4);
+  CHECK_ABI(abi_test_clobber_xmm5);
+#if defined(OPENSSL_WINDOWS)  // xmm6-xmm15 are callee-saved on Win64 only.
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm6), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm7), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm8), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm9), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm10), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm11), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm12), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm13), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm14), "");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI(abi_test_clobber_xmm15), "");
+#else
+  CHECK_ABI(abi_test_clobber_xmm6);
+  CHECK_ABI(abi_test_clobber_xmm7);
+  CHECK_ABI(abi_test_clobber_xmm8);
+  CHECK_ABI(abi_test_clobber_xmm9);
+  CHECK_ABI(abi_test_clobber_xmm10);
+  CHECK_ABI(abi_test_clobber_xmm11);
+  CHECK_ABI(abi_test_clobber_xmm12);
+  CHECK_ABI(abi_test_clobber_xmm13);
+  CHECK_ABI(abi_test_clobber_xmm14);
+  CHECK_ABI(abi_test_clobber_xmm15);
+#endif
+}
+#endif   // OPENSSL_X86_64 && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-x86_64.h
index 9de3240..2d70ca7 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-x86_64.h
@@ -23,6 +23,8 @@
 
 #include <openssl/bn.h>
 
+#include "../bn/internal.h"
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 7d6d616..202ea7e 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -28,6 +28,7 @@
 #include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
+#include "../../test/abi_test.h"
 #include "../../test/file_test.h"
 #include "../../test/test_util.h"
 #include "p256-x86_64.h"
@@ -49,7 +50,7 @@
 
   for (int i = 0; i <= 16; i++) {
     P256_POINT val;
-    ecp_nistz256_select_w5(&val, table, i);
+    CHECK_ABI(ecp_nistz256_select_w5, &val, table, i);
 
     P256_POINT expected;
     if (i == 0) {
@@ -73,7 +74,7 @@
 
   for (int i = 0; i <= 64; i++) {
     P256_POINT_AFFINE val;
-    ecp_nistz256_select_w7(&val, table, i);
+    CHECK_ABI(ecp_nistz256_select_w7, &val, table, i);
 
     P256_POINT_AFFINE expected;
     if (i == 0) {
@@ -106,7 +107,7 @@
   OPENSSL_memset(in, 0, sizeof(in));
 
   // Trying to find the inverse of zero should fail.
-  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
+  ASSERT_FALSE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
 
   // kOneMont is 1, in Montgomery form.
   static const BN_ULONG kOneMont[P256_LIMBS] = {
@@ -127,7 +128,7 @@
     }
 
     EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
-    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
+    ASSERT_TRUE(CHECK_ABI(beeu_mod_inverse_vartime, out, in, order_words));
     EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
 
     // Calculate out*in and confirm that it equals one, modulo the order.
@@ -140,7 +141,7 @@
     EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont)));
 
     // Invert the result and expect to get back to the original value.
-    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
+    ASSERT_TRUE(CHECK_ABI(beeu_mod_inverse_vartime, out, out, order_words));
     EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in)));
   }
 }
@@ -285,19 +286,19 @@
 
   // Test that -A = B.
   BN_ULONG ret[P256_LIMBS];
-  ecp_nistz256_neg(ret, a);
+  CHECK_ABI(ecp_nistz256_neg, ret, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(b, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_neg(ret, ret /* a */);
+  CHECK_ABI(ecp_nistz256_neg, ret, ret /* a */);
   EXPECT_FIELD_ELEMENTS_EQUAL(b, ret);
 
   // Test that -B = A.
-  ecp_nistz256_neg(ret, b);
+  CHECK_ABI(ecp_nistz256_neg, ret, b);
   EXPECT_FIELD_ELEMENTS_EQUAL(a, ret);
 
   OPENSSL_memcpy(ret, b, sizeof(ret));
-  ecp_nistz256_neg(ret, ret /* b */);
+  CHECK_ABI(ecp_nistz256_neg, ret, ret /* b */);
   EXPECT_FIELD_ELEMENTS_EQUAL(a, ret);
 }
 
@@ -308,34 +309,34 @@
   ASSERT_TRUE(GetFieldElement(t, result, "Result"));
 
   BN_ULONG ret[P256_LIMBS];
-  ecp_nistz256_mul_mont(ret, a, b);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, a, b);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
-  ecp_nistz256_mul_mont(ret, b, a);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, b, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_mul_mont(ret, ret /* a */, b);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, ret /* a */, b);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_mul_mont(ret, b, ret);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, b, ret);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, b, sizeof(ret));
-  ecp_nistz256_mul_mont(ret, a, ret /* b */);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, a, ret /* b */);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, b, sizeof(ret));
-  ecp_nistz256_mul_mont(ret, ret /* b */, a);
+  CHECK_ABI(ecp_nistz256_mul_mont, ret, ret /* b */, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   if (OPENSSL_memcmp(a, b, sizeof(a)) == 0) {
-    ecp_nistz256_sqr_mont(ret, a);
+    CHECK_ABI(ecp_nistz256_sqr_mont, ret, a);
     EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
     OPENSSL_memcpy(ret, a, sizeof(ret));
-    ecp_nistz256_sqr_mont(ret, ret /* a */);
+    CHECK_ABI(ecp_nistz256_sqr_mont, ret, ret /* a */);
     EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
   }
 }
@@ -346,11 +347,11 @@
   ASSERT_TRUE(GetFieldElement(t, result, "Result"));
 
   BN_ULONG ret[P256_LIMBS];
-  ecp_nistz256_from_mont(ret, a);
+  CHECK_ABI(ecp_nistz256_from_mont, ret, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_from_mont(ret, ret /* a */);
+  CHECK_ABI(ecp_nistz256_from_mont, ret, ret /* a */);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 }
 
@@ -367,26 +368,26 @@
   ASSERT_TRUE(GetFieldElement(t, result.Y, "Result.Y"));
 
   P256_POINT ret;
-  ecp_nistz256_point_add(&ret, &a, &b);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &a, &b);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
-  ecp_nistz256_point_add(&ret, &b, &a);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &b, &a);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
   OPENSSL_memcpy(&ret, &a, sizeof(ret));
-  ecp_nistz256_point_add(&ret, &ret /* a */, &b);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &ret /* a */, &b);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
   OPENSSL_memcpy(&ret, &a, sizeof(ret));
-  ecp_nistz256_point_add(&ret, &b, &ret /* a */);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &b, &ret /* a */);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
   OPENSSL_memcpy(&ret, &b, sizeof(ret));
-  ecp_nistz256_point_add(&ret, &a, &ret /* b */);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &a, &ret /* b */);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
   OPENSSL_memcpy(&ret, &b, sizeof(ret));
-  ecp_nistz256_point_add(&ret, &ret /* b */, &a);
+  CHECK_ABI(ecp_nistz256_point_add, &ret, &ret /* b */, &a);
   EXPECT_POINTS_EQUAL(&result, &ret);
 
   P256_POINT_AFFINE a_affine, b_affine, infinity;
@@ -398,27 +399,27 @@
   // point at infinity.
   if (OPENSSL_memcmp(&a_affine, &b_affine, sizeof(a_affine)) != 0 ||
       OPENSSL_memcmp(&a_affine, &infinity, sizeof(a_affine)) == 0) {
-    ecp_nistz256_point_add_affine(&ret, &a, &b_affine);
+    CHECK_ABI(ecp_nistz256_point_add_affine, &ret, &a, &b_affine);
     EXPECT_POINTS_EQUAL(&result, &ret);
 
     OPENSSL_memcpy(&ret, &a, sizeof(ret));
-    ecp_nistz256_point_add_affine(&ret, &ret /* a */, &b_affine);
+    CHECK_ABI(ecp_nistz256_point_add_affine, &ret, &ret /* a */, &b_affine);
     EXPECT_POINTS_EQUAL(&result, &ret);
 
-    ecp_nistz256_point_add_affine(&ret, &b, &a_affine);
+    CHECK_ABI(ecp_nistz256_point_add_affine, &ret, &b, &a_affine);
     EXPECT_POINTS_EQUAL(&result, &ret);
 
     OPENSSL_memcpy(&ret, &b, sizeof(ret));
-    ecp_nistz256_point_add_affine(&ret, &ret /* b */, &a_affine);
+    CHECK_ABI(ecp_nistz256_point_add_affine, &ret, &ret /* b */, &a_affine);
     EXPECT_POINTS_EQUAL(&result, &ret);
   }
 
   if (OPENSSL_memcmp(&a, &b, sizeof(a)) == 0) {
-    ecp_nistz256_point_double(&ret, &a);
+    CHECK_ABI(ecp_nistz256_point_double, &ret, &a);
     EXPECT_POINTS_EQUAL(&result, &ret);
 
     ret = a;
-    ecp_nistz256_point_double(&ret, &ret /* a */);
+    CHECK_ABI(ecp_nistz256_point_double, &ret, &ret /* a */);
     EXPECT_POINTS_EQUAL(&result, &ret);
   }
 }
@@ -432,34 +433,34 @@
   ASSERT_TRUE(GetFieldElement(t, result, "Result"));
 
   BN_ULONG ret[P256_LIMBS];
-  ecp_nistz256_ord_mul_mont(ret, a, b);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, a, b);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
-  ecp_nistz256_ord_mul_mont(ret, b, a);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, b, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_ord_mul_mont(ret, ret /* a */, b);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, ret /* a */, b);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, a, sizeof(ret));
-  ecp_nistz256_ord_mul_mont(ret, b, ret);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, b, ret);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, b, sizeof(ret));
-  ecp_nistz256_ord_mul_mont(ret, a, ret /* b */);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, a, ret /* b */);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   OPENSSL_memcpy(ret, b, sizeof(ret));
-  ecp_nistz256_ord_mul_mont(ret, ret /* b */, a);
+  CHECK_ABI(ecp_nistz256_ord_mul_mont, ret, ret /* b */, a);
   EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
   if (OPENSSL_memcmp(a, b, sizeof(a)) == 0) {
-    ecp_nistz256_ord_sqr_mont(ret, a, 1);
+    CHECK_ABI(ecp_nistz256_ord_sqr_mont, ret, a, 1);
     EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
 
     OPENSSL_memcpy(ret, a, sizeof(ret));
-    ecp_nistz256_ord_sqr_mont(ret, ret /* a */, 1);
+    CHECK_ABI(ecp_nistz256_ord_sqr_mont, ret, ret /* a */, 1);
     EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
   }
 }
diff --git a/crypto/test/CMakeLists.txt b/crypto/test/CMakeLists.txt
index 3e02c3c..0b1eab8 100644
--- a/crypto/test/CMakeLists.txt
+++ b/crypto/test/CMakeLists.txt
@@ -3,6 +3,7 @@
 
   OBJECT
 
+  abi_test.cc
   file_test.cc
   malloc.cc
   test_util.cc
diff --git a/crypto/test/abi_test.cc b/crypto/test/abi_test.cc
new file mode 100644
index 0000000..890aa15
--- /dev/null
+++ b/crypto/test/abi_test.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include "abi_test.h"
+
+#include <openssl/rand.h>
+
+
+namespace abi_test {
+namespace internal {
+
+std::string FixVAArgsString(const char *str) {
+  const std::string text(str);
+  const size_t comma = text.find(',');
+  if (comma == std::string::npos) {
+    return text + "()";  // Only a function name was given.
+  }
+  // Drop the first comma along with any spaces on either side of it.
+  size_t name_end = comma;
+  while (name_end > 0 && text[name_end - 1] == ' ') {
+    name_end--;
+  }
+  size_t args_begin = comma + 1;
+  for (; args_begin < text.size() && text[args_begin] == ' '; args_begin++) {}
+  return text.substr(0, name_end) + "(" + text.substr(args_begin) + ")";
+}
+
+#if defined(SUPPORTS_ABI_TEST)
+crypto_word_t RunTrampoline(Result *out, crypto_word_t func,
+                            const crypto_word_t *argv, size_t argc) {
+  CallerState state;  // Random values to load into the callee-saved registers.
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
+
+  // TODO(davidben): Use OS debugging APIs to single-step |func| and test that
+  // CFI and SEH annotations are correct.
+  CallerState state2 = state;  // The trampoline writes the final values here.
+  crypto_word_t ret = abi_test_trampoline(func, &state2, argv, argc);
+
+  *out = Result();  // Start from an empty error list; add one entry per diff.
+#define CALLER_STATE_REGISTER(type, name)                    \
+  if (state.name != state2.name) {                           \
+    out->errors.push_back(#name " was not restored"); \
+  }
+  LOOP_CALLER_STATE_REGISTERS()
+#undef CALLER_STATE_REGISTER
+  return ret;
+}
+#endif
+
+}  // namespace internal
+}  // namespace abi_test
diff --git a/crypto/test/abi_test.h b/crypto/test/abi_test.h
new file mode 100644
index 0000000..ab9a729
--- /dev/null
+++ b/crypto/test/abi_test.h
@@ -0,0 +1,233 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_ABI_TEST_H
+#define OPENSSL_HEADER_ABI_TEST_H
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <openssl/base.h>
+
+#include "../internal.h"
+
+
+// abi_test provides routines for verifying that functions satisfy platform ABI
+// requirements.
+namespace abi_test {
+
+// Result stores the result of an ABI test.
+struct Result {
+  bool ok() const { return errors.empty(); }  // True if no failures.
+
+  std::vector<std::string> errors;  // One message per detected ABI failure.
+};
+
+namespace internal {
+
+// DeductionGuard exposes |T| only as a nested type. Naming a parameter type as
+// DeductionGuard<T>::Type keeps |T| out of template argument deduction for that
+// parameter, so the compiler must deduce it from some other parameter.
+template <typename T>
+struct DeductionGuard {
+  typedef T Type;
+};
+
+// Reg128 holds the contents of one 128-bit register as two 64-bit halves.
+struct alignas(16) Reg128 {
+  bool operator==(const Reg128 &x) const { return lo == x.lo && hi == x.hi; }
+  bool operator!=(const Reg128 &x) const { return !operator==(x); }
+  uint64_t lo, hi;
+};
+
+// LOOP_CALLER_STATE_REGISTERS is a macro that iterates over all registers the
+// callee is expected to save for the caller. It expands to one
+// CALLER_STATE_REGISTER(type, name) per register; define that macro before use.
+// TODO(davidben): Add support for other architectures.
+#if defined(OPENSSL_X86_64)
+#if defined(OPENSSL_WINDOWS)
+// See https://docs.microsoft.com/en-us/cpp/build/x64-software-conventions?view=vs-2017#register-usage
+#define LOOP_CALLER_STATE_REGISTERS()  \
+  CALLER_STATE_REGISTER(uint64_t, rbx) \
+  CALLER_STATE_REGISTER(uint64_t, rbp) \
+  CALLER_STATE_REGISTER(uint64_t, rdi) \
+  CALLER_STATE_REGISTER(uint64_t, rsi) \
+  CALLER_STATE_REGISTER(uint64_t, r12) \
+  CALLER_STATE_REGISTER(uint64_t, r13) \
+  CALLER_STATE_REGISTER(uint64_t, r14) \
+  CALLER_STATE_REGISTER(uint64_t, r15) \
+  CALLER_STATE_REGISTER(Reg128, xmm6)  \
+  CALLER_STATE_REGISTER(Reg128, xmm7)  \
+  CALLER_STATE_REGISTER(Reg128, xmm8)  \
+  CALLER_STATE_REGISTER(Reg128, xmm9)  \
+  CALLER_STATE_REGISTER(Reg128, xmm10) \
+  CALLER_STATE_REGISTER(Reg128, xmm11) \
+  CALLER_STATE_REGISTER(Reg128, xmm12) \
+  CALLER_STATE_REGISTER(Reg128, xmm13) \
+  CALLER_STATE_REGISTER(Reg128, xmm14) \
+  CALLER_STATE_REGISTER(Reg128, xmm15)
+#else
+// See https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+#define LOOP_CALLER_STATE_REGISTERS()  \
+  CALLER_STATE_REGISTER(uint64_t, rbx) \
+  CALLER_STATE_REGISTER(uint64_t, rbp) \
+  CALLER_STATE_REGISTER(uint64_t, r12) \
+  CALLER_STATE_REGISTER(uint64_t, r13) \
+  CALLER_STATE_REGISTER(uint64_t, r14) \
+  CALLER_STATE_REGISTER(uint64_t, r15)
+#endif  // OPENSSL_WINDOWS
+#endif  // OPENSSL_X86_64
+
+// Enable ABI testing if all of the following are true.
+//
+// - We have CallerState and trampoline support for the architecture.
+//
+// - Assembly is enabled.
+//
+// - This is not a shared library build. Assembly functions are not reachable
+//   from tests in shared library builds.
+//
+// - This is a debug build. We can instrument release builds as well, but this
+//   ensures we have coverage for both instrumented and uninstrumented code.
+//   See the comment in |CHECK_ABI|. Note ABI testing is only meaningful for
+//   assembly, which is not affected by compiler optimizations.
+#if defined(LOOP_CALLER_STATE_REGISTERS) && !defined(OPENSSL_NO_ASM) && \
+    !defined(BORINGSSL_SHARED_LIBRARY) && !defined(NDEBUG)
+#define SUPPORTS_ABI_TEST
+
+// CallerState contains all caller state that the callee is expected to
+// preserve: one field per entry in |LOOP_CALLER_STATE_REGISTERS|.
+struct CallerState {
+#define CALLER_STATE_REGISTER(type, name) type name;
+  LOOP_CALLER_STATE_REGISTERS()
+#undef CALLER_STATE_REGISTER
+};
+
+// RunTrampoline runs |func| on |argv|, recording ABI errors in |out|. It does
+// not perform any type-checking.
+crypto_word_t RunTrampoline(Result *out, crypto_word_t func,
+                            const crypto_word_t *argv, size_t argc);
+
+// CheckImpl runs |func| on |args|, recording ABI errors in |out|.
+//
+// It returns the value as a |crypto_word_t| to work around problems when |R| is
+// void. |args| is wrapped in a |DeductionGuard| so |func| determines the
+// template arguments. Otherwise, |args| may deduce |Args| incorrectly. For
+// instance, if |func| takes const int *, and the caller passes an int *, the
+// compiler will complain the deduced types do not match.
+template <typename R, typename... Args>
+inline crypto_word_t CheckImpl(Result *out, R (*func)(Args...),
+                               typename DeductionGuard<Args>::Type... args) {
+  static_assert(sizeof...(args) <= 10,
+                "too many arguments for abi_test_trampoline");
+
+  // Allocate one extra entry so MSVC does not complain about zero-size arrays.
+  crypto_word_t argv[sizeof...(args) + 1] = {
+      (crypto_word_t)args...,  // |args| may be integers or pointers.
+  };
+  return RunTrampoline(out, reinterpret_cast<crypto_word_t>(func), argv,
+                       sizeof...(args));  // Pass the real count, not the +1.
+}
+#else
+// To simplify callers when ABI testing support is unavailable, provide a backup
+// CheckImpl implementation. It must be specialized for void returns because we
+// call |func| directly.
+template <typename R, typename... Args>
+inline typename std::enable_if<!std::is_void<R>::value, crypto_word_t>::type
+CheckImpl(Result *out, R (*func)(Args...),
+          typename DeductionGuard<Args>::Type... args) {
+  *out = Result();  // Nothing is checked here, so no errors are ever reported.
+  return func(args...);
+}
+
+template <typename... Args>
+inline crypto_word_t CheckImpl(Result *out, void (*func)(Args...),
+                               typename DeductionGuard<Args>::Type... args) {
+  *out = Result();
+  func(args...);
+  return 0;  // Placeholder; |func| produced no value.
+}
+#endif  // SUPPORTS_ABI_TEST
+
+// FixVAArgsString takes a string like "f, 1, 2" and returns a string like
+// "f(1, 2)".
+//
+// This is needed because the |CHECK_ABI| macro below cannot be defined as
+// CHECK_ABI(func, ...). The C specification requires that variadic macros bind
+// at least one variadic argument. Clang, GCC, and MSVC all ignore this, but
+// there are issues with trailing commas and different behaviors across
+// compilers.
+std::string FixVAArgsString(const char *str);
+
+// CheckGTest behaves like |CheckImpl|, but it returns the correct type and
+// raises GTest assertions on failure.
+template <typename R, typename... Args>
+inline R CheckGTest(const char *va_args_str, const char *file, int line,
+                    R (*func)(Args...),
+                    typename DeductionGuard<Args>::Type... args) {
+  Result result;
+  crypto_word_t ret = CheckImpl(&result, func, args...);
+  if (!result.ok()) {
+    testing::Message msg;  // Collects every error into a single failure.
+    msg << "ABI failures in " << FixVAArgsString(va_args_str) << ":\n";
+    for (const auto &error : result.errors) {
+      msg << "    " << error << "\n";
+    }
+    ADD_FAILURE_AT(file, line) << msg;  // Non-fatal; reported at |file|:|line|.
+  }
+  return (R)ret;  // Converts the word back to |R|; also valid when |R| is void.
+}
+
+}  // namespace internal
+
+// Check runs |func| on |args| and returns its result cast back to |R|. If ABI
+// testing is supported in this build configuration, ABI failures are written to
+// |out|; otherwise |func| is called directly and |out| is reset to no errors.
+template <typename R, typename... Args>
+inline R Check(Result *out, R (*func)(Args...),
+               typename internal::DeductionGuard<Args>::Type... args) {
+  return (R)internal::CheckImpl(out, func, args...);
+}
+
+}  // namespace abi_test
+
+// CHECK_ABI calls the first argument on the remaining arguments and returns the
+// result. If ABI-testing is supported in this build configuration, it adds a
+// non-fatal GTest failure if the call did not satisfy ABI requirements.
+//
+// |CHECK_ABI| does return the value and thus may replace any function call,
+// provided it takes only simple parameters. It is recommended to integrate it
+// into functional tests of assembly. To ensure coverage of both instrumented
+// and uninstrumented calls, ABI testing is disabled in release-mode tests.
+#define CHECK_ABI(...) \
+  abi_test::internal::CheckGTest(#__VA_ARGS__, __FILE__, __LINE__, __VA_ARGS__)
+
+
+// Internal functions.
+
+#if defined(SUPPORTS_ABI_TEST)
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. We give |func| type |crypto_word_t| to avoid tripping
+// MSVC's warning 4191.
+extern "C" crypto_word_t abi_test_trampoline(
+    crypto_word_t func, abi_test::internal::CallerState *state,
+    const crypto_word_t *argv, size_t argc);
+#endif  // SUPPORTS_ABI_TEST
+
+
+#endif  // OPENSSL_HEADER_ABI_TEST_H
diff --git a/crypto/test/asm/trampoline-x86_64.pl b/crypto/test/asm/trampoline-x86_64.pl
new file mode 100755
index 0000000..b1f9b93
--- /dev/null
+++ b/crypto/test/asm/trampoline-x86_64.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+# Copyright (c) 2018, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# This file defines helper functions for crypto/test/abi_test.h on x86_64. See
+# that header for details on how to use this.
+#
+# For convenience, this file is linked into libcrypto, where consuming builds
+# already support architecture-specific sources. The static linker should drop
+# this code in non-test binaries. This includes a shared library build of
+# libcrypto, provided --gc-sections (ELF), -dead_strip (Mac), or equivalent is
+# used.
+#
+# References:
+#
+# SysV ABI: https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf
+# Win64 ABI: https://docs.microsoft.com/en-us/cpp/build/x64-software-conventions?view=vs-2017
+
+use strict;
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT = *OUT;  # everything printed from here on is piped through $xlate
+
+# @inp is the registers used for function inputs, in order.
+my @inp = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
+                   ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");
+
+# @caller_state is the list of registers that the callee must preserve for the
+# caller. This must match the definition of CallerState in abi_test.h.
+my @caller_state = ("%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15");
+if ($win64) {
+  @caller_state = ("%rbx", "%rbp", "%rdi", "%rsi", "%r12", "%r13", "%r14",
+                   "%r15", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10",
+                   "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
+}
+
+# $caller_state_size is the size of CallerState, in bytes.
+my $caller_state_size = 0;
+foreach (@caller_state) {
+  if (/^%r/) {
+    $caller_state_size += 8;  # each %r* entry takes an 8-byte slot
+  } elsif (/^%xmm/) {
+    $caller_state_size += 16;  # each %xmm* entry takes a 16-byte slot
+  } else {
+    die "unknown register $_";
+  }
+}
+
+# load_caller_state returns code which loads a CallerState structure at
+# $off($reg) into the respective registers. No other registers are touched, but
+# $reg must not be a register in CallerState, as it would be overwritten midway.
+# $cb is an optional callback to add extra lines after each movq or movdqa. $cb
+# is passed the offset, relative to $reg, and name of each register.
+sub load_caller_state {
+  my ($off, $reg, $cb) = @_;
+  my $ret = "";
+  foreach (@caller_state) {
+    my $old_off = $off;  # offset of this register's slot, reported to $cb
+    if (/^%r/) {
+      $ret .= "\tmovq\t$off($reg), $_\n";
+      $off += 8;
+    } elsif (/^%xmm/) {
+      $ret .= "\tmovdqa\t$off($reg), $_\n";
+      $off += 16;
+    } else {
+      die "unknown register $_";
+    }
+    $ret .= $cb->($old_off, $_) if (defined($cb));
+  }
+  return $ret;
+}
+
+# store_caller_state mirrors load_caller_state, except the returned code writes
+# the current register values into $off($reg). $cb is used the same way.
+sub store_caller_state {
+  my ($off, $reg, $cb) = @_;
+  my $ret = "";
+  foreach (@caller_state) {
+    my $old_off = $off;  # offset of this register's slot, reported to $cb
+    if (/^%r/) {
+      $ret .= "\tmovq\t$_, $off($reg)\n";
+      $off += 8;
+    } elsif (/^%xmm/) {
+      $ret .= "\tmovdqa\t$_, $off($reg)\n";
+      $off += 16;
+    } else {
+      die "unknown register $_";
+    }
+    $ret .= $cb->($old_off, $_) if (defined($cb));
+  }
+  return $ret;
+}
+
+# $max_params is the maximum number of parameters abi_test_trampoline supports.
+my $max_params = 10;
+
+# Windows reserves stack space for the register-based parameters, while SysV
+# only reserves space for the overflow ones.
+my $stack_params_skip = $win64 ? scalar(@inp) : 0;
+my $num_stack_params = $win64 ? $max_params : $max_params - scalar(@inp);
+
+my ($func, $state, $argv, $argc) = @inp;
+my $code = <<____;
+.text
+
+# abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+# with |argv|, then saves the callee-saved registers into |state|. It returns
+# the result of |func|.
+# uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+#                              const uint64_t *argv, size_t argc);
+.type	abi_test_trampoline, \@abi-omnipotent
+.globl	abi_test_trampoline
+.align	16
+abi_test_trampoline:
+.cfi_startproc
+	# Stack layout:
+	#   8 bytes - align
+	#   $caller_state_size bytes - saved caller registers
+	#   8 bytes - scratch space
+	#   8 bytes - saved copy of \$state
+	#   8 bytes - saved copy of \$func
+	#   8 bytes - if needed for stack alignment
+	#   8*$num_stack_params bytes - parameters for \$func
+____
+my $stack_alloc_size = 8 + $caller_state_size + 8*3 + 8*$num_stack_params;
+# SysV and Windows both require the stack to be 16-byte-aligned. The call
+# instruction offsets it by 8, so stack allocations must be 8 mod 16.
+if ($stack_alloc_size % 16 != 8) {
+  $num_stack_params++;
+  $stack_alloc_size += 8;
+}
+my $stack_params_offset = 8 * $stack_params_skip;
+my $func_offset = 8 * $num_stack_params;
+my $state_offset = $func_offset + 8;
+my $scratch_offset = $state_offset + 8;
+my $caller_state_offset = $scratch_offset + 8;
+$code .= <<____;
+	subq	\$$stack_alloc_size, %rsp
+.cfi_adjust_cfa_offset	$stack_alloc_size
+____
+# Store our caller's state. This is needed because we modify it ourselves, and
+# also to isolate the test infrastructure from the function under test failing
+# to save some register.
+$code .= store_caller_state($caller_state_offset, "%rsp", sub {
+  my ($off, $reg) = @_;
+  $reg = substr($reg, 1);  # drop the leading '%' for the CFI directive
+  $off -= $stack_alloc_size + 8;  # rebase from %rsp to the CFA for .cfi_offset
+  return ".cfi_offset\t$reg, $off\n";
+});
+
+$code .= load_caller_state(0, $state);
+$code .= <<____;
+	# Stash \$func and \$state, so they are available after the call returns.
+	movq	$func, $func_offset(%rsp)
+	movq	$state, $state_offset(%rsp)
+
+	# Load parameters. Note this will clobber \$argv and \$argc, so we can
+	# only use non-parameter volatile registers. There are three, and they
+	# are the same between SysV and Win64: %rax, %r10, and %r11.
+	movq	$argv, %r10
+	movq	$argc, %r11
+____
+foreach (@inp) {
+	$code .= <<____;
+	dec	%r11
+	js	.Lcall
+	movq	(%r10), $_
+	addq	\$8, %r10
+____
+}
+$code .= <<____;
+	leaq	$stack_params_offset(%rsp), %rax
+.Largs_loop:
+	dec	%r11
+	js	.Lcall
+
+  # This block should be:
+  #    movq (%r10), %rtmp
+  #    movq %rtmp, (%rax)
+  # There are no spare registers available, so we spill into the scratch space.
+	movq	%r11, $scratch_offset(%rsp)
+	movq	(%r10), %r11
+	movq	%r11, (%rax)
+	movq	$scratch_offset(%rsp), %r11
+
+	addq	\$8, %r10
+	addq	\$8, %rax
+	jmp	.Largs_loop
+
+.Lcall:
+	movq	$func_offset(%rsp), %rax
+	call	*%rax
+
+	# Store what \$func did to our state, so our caller can check.
+  movq  $state_offset(%rsp), $state
+____
+$code .= store_caller_state(0, $state);
+
+# Restore our caller's state.
+$code .= load_caller_state($caller_state_offset, "%rsp", sub {
+  my ($off, $reg) = @_;
+  $reg = substr($reg, 1);
+  return ".cfi_restore\t$reg\n";
+});
+$code .= <<____;
+	addq	\$$stack_alloc_size, %rsp
+.cfi_adjust_cfa_offset	-$stack_alloc_size
+
+  # %rax already contains \$func's return value, unmodified.
+	ret
+.cfi_endproc
+.size	abi_test_trampoline,.-abi_test_trampoline
+____
+
+# abi_test_clobber_* zeros the corresponding register. These are used to test
+# the ABI-testing framework.
+foreach ("ax", "bx", "cx", "dx", "di", "si", "bp", 8..15) {
+  $code .= <<____;
+.type	abi_test_clobber_r$_, \@abi-omnipotent
+.globl	abi_test_clobber_r$_
+.align	16
+abi_test_clobber_r$_:
+	xorq	%r$_, %r$_
+	ret
+.size	abi_test_clobber_r$_,.-abi_test_clobber_r$_
+____
+}
+
+foreach (0..15) {
+  $code .= <<____;
+.type	abi_test_clobber_xmm$_, \@abi-omnipotent
+.globl	abi_test_clobber_xmm$_
+.align	16
+abi_test_clobber_xmm$_:
+	pxor	%xmm$_, %xmm$_
+	ret
+.size	abi_test_clobber_xmm$_,.-abi_test_clobber_xmm$_
+____
+}
+
+print $code;  # STDOUT is the pipe to x86_64-xlate.pl opened above
+close STDOUT or die "error closing STDOUT: $! (child status $?)";
diff --git a/crypto/test/gtest_main.cc b/crypto/test/gtest_main.cc
index a557168..f19b830 100644
--- a/crypto/test/gtest_main.cc
+++ b/crypto/test/gtest_main.cc
@@ -20,6 +20,7 @@
 #include <openssl/cpu.h>
 #include <openssl/rand.h>
 
+#include "abi_test.h"
 #include "gtest_main.h"
 #include "../internal.h"
 
@@ -71,5 +72,17 @@
   }
 #endif  // TEST_ARM_CPUS
 
-  return RUN_ALL_TESTS();
+  // Run the entire test suite under an ABI check. This is less effective than
+  // testing the individual assembly functions, but will catch issues with
+  // rarely-used registers.
+  abi_test::Result abi;
+  int ret = abi_test::Check(&abi, RUN_ALL_TESTS);
+  if (!abi.ok()) {
+    fprintf(stderr, "ABI failure in test suite:\n");
+    for (const auto &error : abi.errors) {
+      fprintf(stderr, "    %s\n", error.c_str());
+    }
+    exit(1);
+  }
+  return ret;
 }