Check static CPU capabilities on x86.

On Arm, our CRYPTO_is_*_capable functions check the corresponding
preprocessor symbol. This allows us to automatically drop dynamic checks
and fallback code when some capability is always available.

This CL does the same on x86, as well as consolidates our
OPENSSL_ia32cap_P checks in one place. Since this abstraction is
incompatible with some optimizations we do around OPENSSL_ia32cap_get()
in the FIPS module, I've marked the symbol __attribute__((const)), which
is enough to make GCC and Clang do the optimizations for us. (We already
do the same to DEFINE_BSS_GET.)

Most x86 platforms support a much wider range of capabilities, so this
is usually a no-op. But, notably, all x86_64 Mac hardware has SSSE3
available, so this allows us to statically drop an AES implementation.
(On macOS with -Wl,-dead_strip, this seems to trim 35080 bytes from the
bssl binary.) Configs like -march=native can also drop a bunch of code.

Update-Note: This CL may break build environments that incorrectly mark
some instruction as statically available. This is unlikely to happen
with vector instructions like AVX, where the compiler could freely emit
them anyway. However, instructions like AES-NI might be set incorrectly.

Change-Id: I44fd715c9887d3fda7cb4519c03bee4d4f2c7ea6
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51548
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/cipher_extra/e_aesgcmsiv.c b/crypto/cipher_extra/e_aesgcmsiv.c
index 9e77375..387eaff 100644
--- a/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/crypto/cipher_extra/e_aesgcmsiv.c
@@ -857,22 +857,15 @@
 
 #if defined(AES_GCM_SIV_ASM)
 
-static char avx_aesni_capable(void) {
-  const uint32_t ecx = OPENSSL_ia32cap_P[1];
-
-  return (ecx & (1 << (57 - 32))) != 0 /* AESNI */ &&
-         (ecx & (1 << 28)) != 0 /* AVX */;
-}
-
 const EVP_AEAD *EVP_aead_aes_128_gcm_siv(void) {
-  if (avx_aesni_capable()) {
+  if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
     return &aead_aes_128_gcm_siv_asm;
   }
   return &aead_aes_128_gcm_siv;
 }
 
 const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) {
-  if (avx_aesni_capable()) {
+  if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
     return &aead_aes_256_gcm_siv_asm;
   }
   return &aead_aes_256_gcm_siv;
diff --git a/crypto/cipher_extra/internal.h b/crypto/cipher_extra/internal.h
index 0f5f566..4e8fa46 100644
--- a/crypto/cipher_extra/internal.h
+++ b/crypto/cipher_extra/internal.h
@@ -171,8 +171,7 @@
                       "wrong chacha20_poly1305_seal_data size");
 
 OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) {
-  const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0;
-  return sse41_capable;
+  return CRYPTO_is_SSE4_1_capable();
 }
 
 // chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts
diff --git a/crypto/fipsmodule/aes/internal.h b/crypto/fipsmodule/aes/internal.h
index 9f7dd47..0685bc4 100644
--- a/crypto/fipsmodule/aes/internal.h
+++ b/crypto/fipsmodule/aes/internal.h
@@ -30,18 +30,14 @@
 #define HWAES
 #define HWAES_ECB
 
-OPENSSL_INLINE int hwaes_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0;
-}
+OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_AESNI_capable(); }
 
 #define VPAES
 #if defined(OPENSSL_X86_64)
 #define VPAES_CTR32
 #endif
 #define VPAES_CBC
-OPENSSL_INLINE int vpaes_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_SSSE3_capable(); }
 
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 #define HWAES
diff --git a/crypto/fipsmodule/bn/rsaz_exp.h b/crypto/fipsmodule/bn/rsaz_exp.h
index b8150f1..104bb7a 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.h
+++ b/crypto/fipsmodule/bn/rsaz_exp.h
@@ -41,19 +41,17 @@
                             BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
 
 OPENSSL_INLINE int rsaz_avx2_capable(void) {
-  const uint32_t *cap = OPENSSL_ia32cap_get();
-  return (cap[2] & (1 << 5)) != 0;  // AVX2
+  return CRYPTO_is_AVX2_capable();
 }
 
 OPENSSL_INLINE int rsaz_avx2_preferred(void) {
-  const uint32_t *cap = OPENSSL_ia32cap_get();
-  static const uint32_t kBMI1BMI2AndADX = (1 << 3) | (1 << 8) | (1 << 19);
-  if ((cap[2] & kBMI1BMI2AndADX) == kBMI1BMI2AndADX) {
+  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
+      CRYPTO_is_ADX_capable()) {
     // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
     // .Lmulx4x_enter and .Lpowerx5_enter branches.
     return 0;
   }
-  return (cap[2] & (1 << 5)) != 0;  // AVX2
+  return CRYPTO_is_AVX2_capable();
 }
 
 
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index 99deb36..506b7d2 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -554,7 +554,7 @@
 static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
                                                  EC_SCALAR *out,
                                                  const EC_SCALAR *in) {
-  if ((OPENSSL_ia32cap_get()[1] & (1 << 28)) == 0) {
+  if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; fallback to generic code.
     return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
   }
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-x86_64_test.cc
index a083f3d..f6f070a 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -98,7 +98,7 @@
 }
 
 TEST(P256_X86_64Test, BEEU) {
-  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+  if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; cannot run the BEEU code.
     return;
   }
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 28218b4..5b909aa 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -152,7 +152,7 @@
 
 #if defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
-    if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) {  // AVX+MOVBE
+    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
       gcm_init_avx(out_table, H.u);
       *out_mult = gcm_gmult_avx;
       *out_hash = gcm_ghash_avx;
@@ -164,7 +164,7 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     gcm_init_ssse3(out_table, H.u);
     *out_mult = gcm_gmult_ssse3;
     *out_hash = gcm_ghash_ssse3;
@@ -177,7 +177,7 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     gcm_init_ssse3(out_table, H.u);
     *out_mult = gcm_gmult_ssse3;
     *out_hash = gcm_ghash_ssse3;
@@ -722,9 +722,7 @@
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
 int crypto_gcm_clmul_enabled(void) {
 #if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
-  const uint32_t *ia32cap = OPENSSL_ia32cap_get();
-  return (ia32cap[0] & (1 << 24)) &&  // check FXSR bit
-         (ia32cap[1] & (1 << 1));     // check PCLMULQDQ bit
+  return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable();
 #else
   return 0;
 #endif
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 539b764..d66d8ae 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -136,7 +136,7 @@
 
   alignas(16) u128 Htable[16];
 #if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
     for (size_t blocks : kBlockCounts) {
@@ -152,7 +152,7 @@
     }
 
 #if defined(GHASH_ASM_X86_64)
-    if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) {  // AVX+MOVBE
+    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
       CHECK_ABI_SEH(gcm_init_avx, Htable, kH);
       CHECK_ABI_SEH(gcm_gmult_avx, X, Htable);
       for (size_t blocks : kBlockCounts) {
diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h
index f022f9b..0164aac 100644
--- a/crypto/fipsmodule/modes/internal.h
+++ b/crypto/fipsmodule/modes/internal.h
@@ -253,10 +253,6 @@
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-OPENSSL_INLINE char gcm_ssse3_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
-
 // |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
 // 16-byte-aligned, but |gcm_init_ssse3| does not.
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
diff --git a/crypto/fipsmodule/rand/internal.h b/crypto/fipsmodule/rand/internal.h
index bbeef76..eccf047 100644
--- a/crypto/fipsmodule/rand/internal.h
+++ b/crypto/fipsmodule/rand/internal.h
@@ -143,15 +143,14 @@
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
 
 OPENSSL_INLINE int have_rdrand(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+  return CRYPTO_is_RDRAND_capable();
 }
 
 // have_fast_rdrand returns true if RDRAND is supported and it's reasonably
 // fast. Concretely the latter is defined by whether the chip is Intel (fast) or
 // not (assumed slow).
 OPENSSL_INLINE int have_fast_rdrand(void) {
-  const uint32_t *const ia32cap = OPENSSL_ia32cap_get();
-  return (ia32cap[1] & (1u << 30)) && (ia32cap[0] & (1u << 30));
+  return CRYPTO_is_RDRAND_capable() && CRYPTO_is_intel_cpu();
 }
 
 // CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to
diff --git a/crypto/hrss/hrss.c b/crypto/hrss/hrss.c
index 8e21068..388c9a9 100644
--- a/crypto/hrss/hrss.c
+++ b/crypto/hrss/hrss.c
@@ -1314,8 +1314,7 @@
 static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
                      const struct poly *a, const struct poly *b) {
 #if defined(POLY_RQ_MUL_ASM)
-  const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
-  if (has_avx2) {
+  if (CRYPTO_is_AVX2_capable()) {
     poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
     return;
   }
diff --git a/crypto/hrss/hrss_test.cc b/crypto/hrss/hrss_test.cc
index 0693c82..bab968c 100644
--- a/crypto/hrss/hrss_test.cc
+++ b/crypto/hrss/hrss_test.cc
@@ -453,8 +453,7 @@
 
 #if defined(POLY_RQ_MUL_ASM) && defined(SUPPORTS_ABI_TEST)
 TEST(HRSS, ABI) {
-  const bool has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
-  if (!has_avx2) {
+  if (!CRYPTO_is_AVX2_capable()) {
     fprintf(stderr, "Skipping ABI test due to lack of AVX2 support.\n");
     return;
   }
diff --git a/crypto/impl_dispatch_test.cc b/crypto/impl_dispatch_test.cc
index dae9e96..631e78f 100644
--- a/crypto/impl_dispatch_test.cc
+++ b/crypto/impl_dispatch_test.cc
@@ -33,9 +33,9 @@
  public:
   void SetUp() override {
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-    aesni_ = OPENSSL_ia32cap_P[1] & (1 << (57 - 32));
-    avx_movbe_ = ((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41;
-    ssse3_ = OPENSSL_ia32cap_P[1] & (1 << (41 - 32));
+    aesni_ = CRYPTO_is_AESNI_capable();
+    avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
+    ssse3_ = CRYPTO_is_SSSE3_capable();
     is_x86_64_ =
 #if defined(OPENSSL_X86_64)
         true;
diff --git a/crypto/internal.h b/crypto/internal.h
index a85a60d..032660a 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1012,14 +1012,126 @@
 extern uint32_t OPENSSL_ia32cap_P[4];
 
 #if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY)
-const uint32_t *OPENSSL_ia32cap_get(void);
+// The FIPS module, as a static library, requires an out-of-line version of
+// |OPENSSL_ia32cap_get| so accesses can be rewritten by delocate. Mark the
+// function const so multiple accesses can be optimized together.
+const uint32_t *OPENSSL_ia32cap_get(void) __attribute__((const));
 #else
 OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) {
   return OPENSSL_ia32cap_P;
 }
 #endif
 
+// See Intel manual, volume 2A, table 3-11.
+
+OPENSSL_INLINE int CRYPTO_is_FXSR_capable(void) {
+#if defined(__FXSR__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[0] & (1 << 24)) != 0;
 #endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_intel_cpu(void) {
+  // The reserved bit 30 is used to indicate an Intel CPU.
+  return (OPENSSL_ia32cap_get()[0] & (1 << 30)) != 0;
+}
+
+// See Intel manual, volume 2A, table 3-10.
+
+OPENSSL_INLINE int CRYPTO_is_PCLMUL_capable(void) {
+#if defined(__PCLMUL__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 1)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSSE3_capable(void) {
+#if defined(__SSSE3__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 9)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSE4_1_capable(void) {
+#if defined(__SSE4_1__)
+  return 1;
+#else  // Read via |OPENSSL_ia32cap_get|, not |OPENSSL_ia32cap_P| directly.
+  return (OPENSSL_ia32cap_get()[1] & (1 << 19)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_MOVBE_capable(void) {
+#if defined(__MOVBE__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 22)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) {
+#if defined(__AES__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) {
+#if defined(__AVX__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_RDRAND_capable(void) {
+  // The GCC/Clang feature name and preprocessor symbol for RDRAND are "rdrnd"
+  // and |__RDRND__|, respectively.
+#if defined(__RDRND__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+#endif
+}
+
+// See Intel manual, volume 2A, table 3-8.
+
+OPENSSL_INLINE int CRYPTO_is_BMI1_capable(void) {
+#if defined(__BMI__)  // GCC and Clang name the BMI1 symbol |__BMI__|.
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 3)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX2_capable(void) {
+#if defined(__AVX2__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 5)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_BMI2_capable(void) {
+#if defined(__BMI2__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 8)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) {
+#if defined(__ADX__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 19)) != 0;
+#endif
+}
+
+#endif  // OPENSSL_X86 || OPENSSL_X86_64
 
 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)