Update fiat-crypto.

The files no longer need to be patched because fiat-crypto now has its
own copy of our value barrier. It does, however, require syncing our
NO_ASM define with fiat's.

fiat-crypto is now licensed under any of MIT, BSD 1-Clause, or Apache 2.0.
I've stuck with the MIT one, as that's what we were previously importing.

No measurable perf difference before/after this CL, with GCC or Clang on
x86_64.

Change-Id: I2939fd517de37aabdea3ead49150135200a1b112
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/52045
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c
index 7cb0add..e316acd 100644
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -36,6 +36,10 @@
 // Various pre-computed constants.
 #include "./curve25519_tables.h"
 
+#if defined(OPENSSL_NO_ASM)
+#define FIAT_25519_NO_ASM
+#endif
+
 #if defined(BORINGSSL_CURVE25519_64BIT)
 #include "../../third_party/fiat/curve25519_64.h"
 #else
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index 9f5694c..0d0e766 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -31,8 +31,10 @@
 #include "../delocate.h"
 #include "./internal.h"
 
+#if defined(OPENSSL_NO_ASM)
+#define FIAT_P256_NO_ASM
+#endif
 
-// MSVC does not implement uint128_t, and crashes with intrinsics
 #if defined(BORINGSSL_HAS_UINT128)
 #define BORINGSSL_NISTP256_64BIT 1
 #include "../../../third_party/fiat/p256_64.h"
diff --git a/third_party/fiat/LICENSE b/third_party/fiat/LICENSE
index bd46c61..70cae03 100644
--- a/third_party/fiat/LICENSE
+++ b/third_party/fiat/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2015-2016 the fiat-crypto authors (see
+Copyright (c) 2015-2020 the fiat-crypto authors (see
 https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS).
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/third_party/fiat/METADATA b/third_party/fiat/METADATA
index e527c14..339fe5f 100644
--- a/third_party/fiat/METADATA
+++ b/third_party/fiat/METADATA
@@ -6,8 +6,8 @@
     type: GIT
     value: "https://github.com/mit-plv/fiat-crypto"
   }
-  version: "0884b6d374a9d937c44bf024fe3a647ffae2c540"
-  last_upgrade_date { year: 2020 month: 4 day: 16 }
+  version: "6ccc6638716d4632304baf1adbb5c47c3a12ea6f"
+  last_upgrade_date { year: 2022 month: 3 day: 22 }
 
-  local_modifications: "Files renamed to .h for BoringSSL integration. Select functions patched with value barriers."
+  local_modifications: "Files renamed to .h for BoringSSL integration. LICENSE file is LICENSE-MIT from upstream."
 }
diff --git a/third_party/fiat/curve25519_32.h b/third_party/fiat/curve25519_32.h
index 7b78d00..cb83c60 100644
--- a/third_party/fiat/curve25519_32.h
+++ b/third_party/fiat/curve25519_32.h
@@ -1,24 +1,51 @@
-/* Autogenerated: src/ExtractionOCaml/unsaturated_solinas --static 25519 10 '2^255 - 19' 32 carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes carry_scmul121666 */
+/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 32 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */
 /* curve description: 25519 */
-/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, carry_scmul121666 */
-/* n = 10 (from "10") */
-/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */
 /* machine_wordsize = 32 (from "32") */
-
+/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */
+/* n = 10 (from "(auto)") */
+/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */
+/* tight_bounds_multiplier = 1 (from "") */
+/*  */
 /* Computed values: */
-/* carry_chain = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1] */
+/*   carry_chain = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1] */
+/*   eval z = z[0] + (z[1] << 26) + (z[2] << 51) + (z[3] << 77) + (z[4] << 102) + (z[5] << 128) + (z[6] << 153) + (z[7] << 179) + (z[8] << 204) + (z[9] << 230) */
+/*   bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/*   balance = [0x7ffffda, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe, 0x7fffffe, 0x3fffffe] */
 
 #include <stdint.h>
 typedef unsigned char fiat_25519_uint1;
 typedef signed char fiat_25519_int1;
+#if defined(__GNUC__) || defined(__clang__)
+#  define FIAT_25519_FIAT_INLINE __inline__
+#else
+#  define FIAT_25519_FIAT_INLINE
+#endif
+
+/* The type fiat_25519_loose_field_element is a field element with loose bounds. */
+/* Bounds: [[0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000], [0x0 ~> 0xc000000], [0x0 ~> 0x6000000]] */
+typedef uint32_t fiat_25519_loose_field_element[10];
+
+/* The type fiat_25519_tight_field_element is a field element with tight bounds. */
+/* Bounds: [[0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000], [0x0 ~> 0x4000000], [0x0 ~> 0x2000000]] */
+typedef uint32_t fiat_25519_tight_field_element[10];
 
 #if (-1 & 3) != 3
 #error "This code only works on a two's complement system"
 #endif
 
+#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+static __inline__ uint32_t fiat_25519_value_barrier_u32(uint32_t a) {
+  __asm__("" : "+r"(a) : /* no inputs */);
+  return a;
+}
+#else
+#  define fiat_25519_value_barrier_u32(x) (x)
+#endif
+
 
 /*
  * The function fiat_25519_addcarryx_u26 is an addition with carry.
+ *
  * Postconditions:
  *   out1 = (arg1 + arg2 + arg3) mod 2^26
  *   out2 = ⌊(arg1 + arg2 + arg3) / 2^26⌋
@@ -31,16 +58,20 @@
  *   out1: [0x0 ~> 0x3ffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_addcarryx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  uint32_t x1 = ((arg1 + arg2) + arg3);
-  uint32_t x2 = (x1 & UINT32_C(0x3ffffff));
-  fiat_25519_uint1 x3 = (fiat_25519_uint1)(x1 >> 26);
+static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  uint32_t x1;
+  uint32_t x2;
+  fiat_25519_uint1 x3;
+  x1 = ((arg1 + arg2) + arg3);
+  x2 = (x1 & UINT32_C(0x3ffffff));
+  x3 = (fiat_25519_uint1)(x1 >> 26);
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_25519_subborrowx_u26 is a subtraction with borrow.
+ *
  * Postconditions:
  *   out1 = (-arg1 + arg2 + -arg3) mod 2^26
  *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^26⌋
@@ -53,16 +84,20 @@
  *   out1: [0x0 ~> 0x3ffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_subborrowx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  int32_t x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
-  fiat_25519_int1 x2 = (fiat_25519_int1)(x1 >> 26);
-  uint32_t x3 = (x1 & UINT32_C(0x3ffffff));
+static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u26(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  int32_t x1;
+  fiat_25519_int1 x2;
+  uint32_t x3;
+  x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
+  x2 = (fiat_25519_int1)(x1 >> 26);
+  x3 = (x1 & UINT32_C(0x3ffffff));
   *out1 = x3;
   *out2 = (fiat_25519_uint1)(0x0 - x2);
 }
 
 /*
  * The function fiat_25519_addcarryx_u25 is an addition with carry.
+ *
  * Postconditions:
  *   out1 = (arg1 + arg2 + arg3) mod 2^25
  *   out2 = ⌊(arg1 + arg2 + arg3) / 2^25⌋
@@ -75,16 +110,20 @@
  *   out1: [0x0 ~> 0x1ffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_addcarryx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  uint32_t x1 = ((arg1 + arg2) + arg3);
-  uint32_t x2 = (x1 & UINT32_C(0x1ffffff));
-  fiat_25519_uint1 x3 = (fiat_25519_uint1)(x1 >> 25);
+static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  uint32_t x1;
+  uint32_t x2;
+  fiat_25519_uint1 x3;
+  x1 = ((arg1 + arg2) + arg3);
+  x2 = (x1 & UINT32_C(0x1ffffff));
+  x3 = (fiat_25519_uint1)(x1 >> 25);
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_25519_subborrowx_u25 is a subtraction with borrow.
+ *
  * Postconditions:
  *   out1 = (-arg1 + arg2 + -arg3) mod 2^25
  *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^25⌋
@@ -97,16 +136,20 @@
  *   out1: [0x0 ~> 0x1ffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_subborrowx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  int32_t x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
-  fiat_25519_int1 x2 = (fiat_25519_int1)(x1 >> 25);
-  uint32_t x3 = (x1 & UINT32_C(0x1ffffff));
+static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u25(uint32_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  int32_t x1;
+  fiat_25519_int1 x2;
+  uint32_t x3;
+  x1 = ((int32_t)(arg2 - arg1) - (int32_t)arg3);
+  x2 = (fiat_25519_int1)(x1 >> 25);
+  x3 = (x1 & UINT32_C(0x1ffffff));
   *out1 = x3;
   *out2 = (fiat_25519_uint1)(0x0 - x2);
 }
 
 /*
  * The function fiat_25519_cmovznz_u32 is a single-word conditional move.
+ *
  * Postconditions:
  *   out1 = (if arg1 = 0 then arg2 else arg3)
  *
@@ -117,178 +160,318 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffff]
  */
-static void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  fiat_25519_uint1 x1 = (!(!arg1));
-  uint32_t x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff));
-  // Note this line has been patched from the synthesized code to add value
-  // barriers.
-  //
-  // Clang recognizes this pattern as a select. While it usually transforms it
-  // to a cmov, it sometimes further transforms it into a branch, which we do
-  // not want.
-  uint32_t x3 = ((value_barrier_u32(x2) & arg3) | (value_barrier_u32(~x2) & arg2));
+static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u32(uint32_t* out1, fiat_25519_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  fiat_25519_uint1 x1;
+  uint32_t x2;
+  uint32_t x3;
+  x1 = (!(!arg1));
+  x2 = ((fiat_25519_int1)(0x0 - x1) & UINT32_C(0xffffffff));
+  x3 = ((fiat_25519_value_barrier_u32(x2) & arg3) | (fiat_25519_value_barrier_u32((~x2)) & arg2));
   *out1 = x3;
 }
 
 /*
  * The function fiat_25519_carry_mul multiplies two field elements and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
- *   arg2: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  */
-static void fiat_25519_carry_mul(uint32_t out1[10], const uint32_t arg1[10], const uint32_t arg2[10]) {
-  uint64_t x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26)));
-  uint64_t x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26)));
-  uint64_t x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26)));
-  uint64_t x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13)));
-  uint64_t x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26)));
-  uint64_t x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13)));
-  uint64_t x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26)));
-  uint64_t x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13)));
-  uint64_t x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13)));
-  uint64_t x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13)));
-  uint64_t x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13)));
-  uint64_t x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13)));
-  uint64_t x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13)));
-  uint64_t x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26)));
-  uint64_t x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26)));
-  uint64_t x21 = ((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26)));
-  uint64_t x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13)));
-  uint64_t x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26)));
-  uint64_t x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13)));
-  uint64_t x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13)));
-  uint64_t x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13)));
-  uint64_t x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13)));
-  uint64_t x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26)));
-  uint64_t x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26)));
-  uint64_t x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26)));
-  uint64_t x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13)));
-  uint64_t x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13)));
-  uint64_t x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13)));
-  uint64_t x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26)));
-  uint64_t x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26)));
-  uint64_t x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13)));
-  uint64_t x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13)));
-  uint64_t x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26)));
-  uint64_t x46 = ((uint64_t)(arg1[9]) * (arg2[0]));
-  uint64_t x47 = ((uint64_t)(arg1[8]) * (arg2[1]));
-  uint64_t x48 = ((uint64_t)(arg1[8]) * (arg2[0]));
-  uint64_t x49 = ((uint64_t)(arg1[7]) * (arg2[2]));
-  uint64_t x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2));
-  uint64_t x51 = ((uint64_t)(arg1[7]) * (arg2[0]));
-  uint64_t x52 = ((uint64_t)(arg1[6]) * (arg2[3]));
-  uint64_t x53 = ((uint64_t)(arg1[6]) * (arg2[2]));
-  uint64_t x54 = ((uint64_t)(arg1[6]) * (arg2[1]));
-  uint64_t x55 = ((uint64_t)(arg1[6]) * (arg2[0]));
-  uint64_t x56 = ((uint64_t)(arg1[5]) * (arg2[4]));
-  uint64_t x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2));
-  uint64_t x58 = ((uint64_t)(arg1[5]) * (arg2[2]));
-  uint64_t x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2));
-  uint64_t x60 = ((uint64_t)(arg1[5]) * (arg2[0]));
-  uint64_t x61 = ((uint64_t)(arg1[4]) * (arg2[5]));
-  uint64_t x62 = ((uint64_t)(arg1[4]) * (arg2[4]));
-  uint64_t x63 = ((uint64_t)(arg1[4]) * (arg2[3]));
-  uint64_t x64 = ((uint64_t)(arg1[4]) * (arg2[2]));
-  uint64_t x65 = ((uint64_t)(arg1[4]) * (arg2[1]));
-  uint64_t x66 = ((uint64_t)(arg1[4]) * (arg2[0]));
-  uint64_t x67 = ((uint64_t)(arg1[3]) * (arg2[6]));
-  uint64_t x68 = ((uint64_t)(arg1[3]) * ((arg2[5]) * 0x2));
-  uint64_t x69 = ((uint64_t)(arg1[3]) * (arg2[4]));
-  uint64_t x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2));
-  uint64_t x71 = ((uint64_t)(arg1[3]) * (arg2[2]));
-  uint64_t x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2));
-  uint64_t x73 = ((uint64_t)(arg1[3]) * (arg2[0]));
-  uint64_t x74 = ((uint64_t)(arg1[2]) * (arg2[7]));
-  uint64_t x75 = ((uint64_t)(arg1[2]) * (arg2[6]));
-  uint64_t x76 = ((uint64_t)(arg1[2]) * (arg2[5]));
-  uint64_t x77 = ((uint64_t)(arg1[2]) * (arg2[4]));
-  uint64_t x78 = ((uint64_t)(arg1[2]) * (arg2[3]));
-  uint64_t x79 = ((uint64_t)(arg1[2]) * (arg2[2]));
-  uint64_t x80 = ((uint64_t)(arg1[2]) * (arg2[1]));
-  uint64_t x81 = ((uint64_t)(arg1[2]) * (arg2[0]));
-  uint64_t x82 = ((uint64_t)(arg1[1]) * (arg2[8]));
-  uint64_t x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2));
-  uint64_t x84 = ((uint64_t)(arg1[1]) * (arg2[6]));
-  uint64_t x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2));
-  uint64_t x86 = ((uint64_t)(arg1[1]) * (arg2[4]));
-  uint64_t x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2));
-  uint64_t x88 = ((uint64_t)(arg1[1]) * (arg2[2]));
-  uint64_t x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2));
-  uint64_t x90 = ((uint64_t)(arg1[1]) * (arg2[0]));
-  uint64_t x91 = ((uint64_t)(arg1[0]) * (arg2[9]));
-  uint64_t x92 = ((uint64_t)(arg1[0]) * (arg2[8]));
-  uint64_t x93 = ((uint64_t)(arg1[0]) * (arg2[7]));
-  uint64_t x94 = ((uint64_t)(arg1[0]) * (arg2[6]));
-  uint64_t x95 = ((uint64_t)(arg1[0]) * (arg2[5]));
-  uint64_t x96 = ((uint64_t)(arg1[0]) * (arg2[4]));
-  uint64_t x97 = ((uint64_t)(arg1[0]) * (arg2[3]));
-  uint64_t x98 = ((uint64_t)(arg1[0]) * (arg2[2]));
-  uint64_t x99 = ((uint64_t)(arg1[0]) * (arg2[1]));
-  uint64_t x100 = ((uint64_t)(arg1[0]) * (arg2[0]));
-  uint64_t x101 = (x100 + (x45 + (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9)))))))));
-  uint64_t x102 = (x101 >> 26);
-  uint32_t x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
-  uint64_t x104 = (x91 + (x82 + (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46)))))))));
-  uint64_t x105 = (x92 + (x83 + (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1)))))))));
-  uint64_t x106 = (x93 + (x84 + (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2)))))))));
-  uint64_t x107 = (x94 + (x85 + (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3)))))))));
-  uint64_t x108 = (x95 + (x86 + (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4)))))))));
-  uint64_t x109 = (x96 + (x87 + (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5)))))))));
-  uint64_t x110 = (x97 + (x88 + (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6)))))))));
-  uint64_t x111 = (x98 + (x89 + (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7)))))))));
-  uint64_t x112 = (x99 + (x90 + (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8)))))))));
-  uint64_t x113 = (x102 + x112);
-  uint64_t x114 = (x113 >> 25);
-  uint32_t x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff));
-  uint64_t x116 = (x114 + x111);
-  uint64_t x117 = (x116 >> 26);
-  uint32_t x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff));
-  uint64_t x119 = (x117 + x110);
-  uint64_t x120 = (x119 >> 25);
-  uint32_t x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff));
-  uint64_t x122 = (x120 + x109);
-  uint64_t x123 = (x122 >> 26);
-  uint32_t x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff));
-  uint64_t x125 = (x123 + x108);
-  uint64_t x126 = (x125 >> 25);
-  uint32_t x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff));
-  uint64_t x128 = (x126 + x107);
-  uint64_t x129 = (x128 >> 26);
-  uint32_t x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff));
-  uint64_t x131 = (x129 + x106);
-  uint64_t x132 = (x131 >> 25);
-  uint32_t x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff));
-  uint64_t x134 = (x132 + x105);
-  uint64_t x135 = (x134 >> 26);
-  uint32_t x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff));
-  uint64_t x137 = (x135 + x104);
-  uint64_t x138 = (x137 >> 25);
-  uint32_t x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff));
-  uint64_t x140 = (x138 * UINT8_C(0x13));
-  uint64_t x141 = (x103 + x140);
-  uint32_t x142 = (uint32_t)(x141 >> 26);
-  uint32_t x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff));
-  uint32_t x144 = (x142 + x115);
-  fiat_25519_uint1 x145 = (fiat_25519_uint1)(x144 >> 25);
-  uint32_t x146 = (x144 & UINT32_C(0x1ffffff));
-  uint32_t x147 = (x145 + x118);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  uint64_t x14;
+  uint64_t x15;
+  uint64_t x16;
+  uint64_t x17;
+  uint64_t x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  uint64_t x26;
+  uint64_t x27;
+  uint64_t x28;
+  uint64_t x29;
+  uint64_t x30;
+  uint64_t x31;
+  uint64_t x32;
+  uint64_t x33;
+  uint64_t x34;
+  uint64_t x35;
+  uint64_t x36;
+  uint64_t x37;
+  uint64_t x38;
+  uint64_t x39;
+  uint64_t x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  uint64_t x48;
+  uint64_t x49;
+  uint64_t x50;
+  uint64_t x51;
+  uint64_t x52;
+  uint64_t x53;
+  uint64_t x54;
+  uint64_t x55;
+  uint64_t x56;
+  uint64_t x57;
+  uint64_t x58;
+  uint64_t x59;
+  uint64_t x60;
+  uint64_t x61;
+  uint64_t x62;
+  uint64_t x63;
+  uint64_t x64;
+  uint64_t x65;
+  uint64_t x66;
+  uint64_t x67;
+  uint64_t x68;
+  uint64_t x69;
+  uint64_t x70;
+  uint64_t x71;
+  uint64_t x72;
+  uint64_t x73;
+  uint64_t x74;
+  uint64_t x75;
+  uint64_t x76;
+  uint64_t x77;
+  uint64_t x78;
+  uint64_t x79;
+  uint64_t x80;
+  uint64_t x81;
+  uint64_t x82;
+  uint64_t x83;
+  uint64_t x84;
+  uint64_t x85;
+  uint64_t x86;
+  uint64_t x87;
+  uint64_t x88;
+  uint64_t x89;
+  uint64_t x90;
+  uint64_t x91;
+  uint64_t x92;
+  uint64_t x93;
+  uint64_t x94;
+  uint64_t x95;
+  uint64_t x96;
+  uint64_t x97;
+  uint64_t x98;
+  uint64_t x99;
+  uint64_t x100;
+  uint64_t x101;
+  uint64_t x102;
+  uint32_t x103;
+  uint64_t x104;
+  uint64_t x105;
+  uint64_t x106;
+  uint64_t x107;
+  uint64_t x108;
+  uint64_t x109;
+  uint64_t x110;
+  uint64_t x111;
+  uint64_t x112;
+  uint64_t x113;
+  uint64_t x114;
+  uint32_t x115;
+  uint64_t x116;
+  uint64_t x117;
+  uint32_t x118;
+  uint64_t x119;
+  uint64_t x120;
+  uint32_t x121;
+  uint64_t x122;
+  uint64_t x123;
+  uint32_t x124;
+  uint64_t x125;
+  uint64_t x126;
+  uint32_t x127;
+  uint64_t x128;
+  uint64_t x129;
+  uint32_t x130;
+  uint64_t x131;
+  uint64_t x132;
+  uint32_t x133;
+  uint64_t x134;
+  uint64_t x135;
+  uint32_t x136;
+  uint64_t x137;
+  uint64_t x138;
+  uint32_t x139;
+  uint64_t x140;
+  uint64_t x141;
+  uint32_t x142;
+  uint32_t x143;
+  uint32_t x144;
+  fiat_25519_uint1 x145;
+  uint32_t x146;
+  uint32_t x147;
+  x1 = ((uint64_t)(arg1[9]) * ((arg2[9]) * UINT8_C(0x26)));
+  x2 = ((uint64_t)(arg1[9]) * ((arg2[8]) * UINT8_C(0x13)));
+  x3 = ((uint64_t)(arg1[9]) * ((arg2[7]) * UINT8_C(0x26)));
+  x4 = ((uint64_t)(arg1[9]) * ((arg2[6]) * UINT8_C(0x13)));
+  x5 = ((uint64_t)(arg1[9]) * ((arg2[5]) * UINT8_C(0x26)));
+  x6 = ((uint64_t)(arg1[9]) * ((arg2[4]) * UINT8_C(0x13)));
+  x7 = ((uint64_t)(arg1[9]) * ((arg2[3]) * UINT8_C(0x26)));
+  x8 = ((uint64_t)(arg1[9]) * ((arg2[2]) * UINT8_C(0x13)));
+  x9 = ((uint64_t)(arg1[9]) * ((arg2[1]) * UINT8_C(0x26)));
+  x10 = ((uint64_t)(arg1[8]) * ((arg2[9]) * UINT8_C(0x13)));
+  x11 = ((uint64_t)(arg1[8]) * ((arg2[8]) * UINT8_C(0x13)));
+  x12 = ((uint64_t)(arg1[8]) * ((arg2[7]) * UINT8_C(0x13)));
+  x13 = ((uint64_t)(arg1[8]) * ((arg2[6]) * UINT8_C(0x13)));
+  x14 = ((uint64_t)(arg1[8]) * ((arg2[5]) * UINT8_C(0x13)));
+  x15 = ((uint64_t)(arg1[8]) * ((arg2[4]) * UINT8_C(0x13)));
+  x16 = ((uint64_t)(arg1[8]) * ((arg2[3]) * UINT8_C(0x13)));
+  x17 = ((uint64_t)(arg1[8]) * ((arg2[2]) * UINT8_C(0x13)));
+  x18 = ((uint64_t)(arg1[7]) * ((arg2[9]) * UINT8_C(0x26)));
+  x19 = ((uint64_t)(arg1[7]) * ((arg2[8]) * UINT8_C(0x13)));
+  x20 = ((uint64_t)(arg1[7]) * ((arg2[7]) * UINT8_C(0x26)));
+  x21 = ((uint64_t)(arg1[7]) * ((arg2[6]) * UINT8_C(0x13)));
+  x22 = ((uint64_t)(arg1[7]) * ((arg2[5]) * UINT8_C(0x26)));
+  x23 = ((uint64_t)(arg1[7]) * ((arg2[4]) * UINT8_C(0x13)));
+  x24 = ((uint64_t)(arg1[7]) * ((arg2[3]) * UINT8_C(0x26)));
+  x25 = ((uint64_t)(arg1[6]) * ((arg2[9]) * UINT8_C(0x13)));
+  x26 = ((uint64_t)(arg1[6]) * ((arg2[8]) * UINT8_C(0x13)));
+  x27 = ((uint64_t)(arg1[6]) * ((arg2[7]) * UINT8_C(0x13)));
+  x28 = ((uint64_t)(arg1[6]) * ((arg2[6]) * UINT8_C(0x13)));
+  x29 = ((uint64_t)(arg1[6]) * ((arg2[5]) * UINT8_C(0x13)));
+  x30 = ((uint64_t)(arg1[6]) * ((arg2[4]) * UINT8_C(0x13)));
+  x31 = ((uint64_t)(arg1[5]) * ((arg2[9]) * UINT8_C(0x26)));
+  x32 = ((uint64_t)(arg1[5]) * ((arg2[8]) * UINT8_C(0x13)));
+  x33 = ((uint64_t)(arg1[5]) * ((arg2[7]) * UINT8_C(0x26)));
+  x34 = ((uint64_t)(arg1[5]) * ((arg2[6]) * UINT8_C(0x13)));
+  x35 = ((uint64_t)(arg1[5]) * ((arg2[5]) * UINT8_C(0x26)));
+  x36 = ((uint64_t)(arg1[4]) * ((arg2[9]) * UINT8_C(0x13)));
+  x37 = ((uint64_t)(arg1[4]) * ((arg2[8]) * UINT8_C(0x13)));
+  x38 = ((uint64_t)(arg1[4]) * ((arg2[7]) * UINT8_C(0x13)));
+  x39 = ((uint64_t)(arg1[4]) * ((arg2[6]) * UINT8_C(0x13)));
+  x40 = ((uint64_t)(arg1[3]) * ((arg2[9]) * UINT8_C(0x26)));
+  x41 = ((uint64_t)(arg1[3]) * ((arg2[8]) * UINT8_C(0x13)));
+  x42 = ((uint64_t)(arg1[3]) * ((arg2[7]) * UINT8_C(0x26)));
+  x43 = ((uint64_t)(arg1[2]) * ((arg2[9]) * UINT8_C(0x13)));
+  x44 = ((uint64_t)(arg1[2]) * ((arg2[8]) * UINT8_C(0x13)));
+  x45 = ((uint64_t)(arg1[1]) * ((arg2[9]) * UINT8_C(0x26)));
+  x46 = ((uint64_t)(arg1[9]) * (arg2[0]));
+  x47 = ((uint64_t)(arg1[8]) * (arg2[1]));
+  x48 = ((uint64_t)(arg1[8]) * (arg2[0]));
+  x49 = ((uint64_t)(arg1[7]) * (arg2[2]));
+  x50 = ((uint64_t)(arg1[7]) * ((arg2[1]) * 0x2));
+  x51 = ((uint64_t)(arg1[7]) * (arg2[0]));
+  x52 = ((uint64_t)(arg1[6]) * (arg2[3]));
+  x53 = ((uint64_t)(arg1[6]) * (arg2[2]));
+  x54 = ((uint64_t)(arg1[6]) * (arg2[1]));
+  x55 = ((uint64_t)(arg1[6]) * (arg2[0]));
+  x56 = ((uint64_t)(arg1[5]) * (arg2[4]));
+  x57 = ((uint64_t)(arg1[5]) * ((arg2[3]) * 0x2));
+  x58 = ((uint64_t)(arg1[5]) * (arg2[2]));
+  x59 = ((uint64_t)(arg1[5]) * ((arg2[1]) * 0x2));
+  x60 = ((uint64_t)(arg1[5]) * (arg2[0]));
+  x61 = ((uint64_t)(arg1[4]) * (arg2[5]));
+  x62 = ((uint64_t)(arg1[4]) * (arg2[4]));
+  x63 = ((uint64_t)(arg1[4]) * (arg2[3]));
+  x64 = ((uint64_t)(arg1[4]) * (arg2[2]));
+  x65 = ((uint64_t)(arg1[4]) * (arg2[1]));
+  x66 = ((uint64_t)(arg1[4]) * (arg2[0]));
+  x67 = ((uint64_t)(arg1[3]) * (arg2[6]));
+  x68 = ((uint64_t)(arg1[3]) * ((arg2[5]) * 0x2));
+  x69 = ((uint64_t)(arg1[3]) * (arg2[4]));
+  x70 = ((uint64_t)(arg1[3]) * ((arg2[3]) * 0x2));
+  x71 = ((uint64_t)(arg1[3]) * (arg2[2]));
+  x72 = ((uint64_t)(arg1[3]) * ((arg2[1]) * 0x2));
+  x73 = ((uint64_t)(arg1[3]) * (arg2[0]));
+  x74 = ((uint64_t)(arg1[2]) * (arg2[7]));
+  x75 = ((uint64_t)(arg1[2]) * (arg2[6]));
+  x76 = ((uint64_t)(arg1[2]) * (arg2[5]));
+  x77 = ((uint64_t)(arg1[2]) * (arg2[4]));
+  x78 = ((uint64_t)(arg1[2]) * (arg2[3]));
+  x79 = ((uint64_t)(arg1[2]) * (arg2[2]));
+  x80 = ((uint64_t)(arg1[2]) * (arg2[1]));
+  x81 = ((uint64_t)(arg1[2]) * (arg2[0]));
+  x82 = ((uint64_t)(arg1[1]) * (arg2[8]));
+  x83 = ((uint64_t)(arg1[1]) * ((arg2[7]) * 0x2));
+  x84 = ((uint64_t)(arg1[1]) * (arg2[6]));
+  x85 = ((uint64_t)(arg1[1]) * ((arg2[5]) * 0x2));
+  x86 = ((uint64_t)(arg1[1]) * (arg2[4]));
+  x87 = ((uint64_t)(arg1[1]) * ((arg2[3]) * 0x2));
+  x88 = ((uint64_t)(arg1[1]) * (arg2[2]));
+  x89 = ((uint64_t)(arg1[1]) * ((arg2[1]) * 0x2));
+  x90 = ((uint64_t)(arg1[1]) * (arg2[0]));
+  x91 = ((uint64_t)(arg1[0]) * (arg2[9]));
+  x92 = ((uint64_t)(arg1[0]) * (arg2[8]));
+  x93 = ((uint64_t)(arg1[0]) * (arg2[7]));
+  x94 = ((uint64_t)(arg1[0]) * (arg2[6]));
+  x95 = ((uint64_t)(arg1[0]) * (arg2[5]));
+  x96 = ((uint64_t)(arg1[0]) * (arg2[4]));
+  x97 = ((uint64_t)(arg1[0]) * (arg2[3]));
+  x98 = ((uint64_t)(arg1[0]) * (arg2[2]));
+  x99 = ((uint64_t)(arg1[0]) * (arg2[1]));
+  x100 = ((uint64_t)(arg1[0]) * (arg2[0]));
+  x101 = (x100 + (x45 + (x44 + (x42 + (x39 + (x35 + (x30 + (x24 + (x17 + x9)))))))));
+  x102 = (x101 >> 26);
+  x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
+  x104 = (x91 + (x82 + (x74 + (x67 + (x61 + (x56 + (x52 + (x49 + (x47 + x46)))))))));
+  x105 = (x92 + (x83 + (x75 + (x68 + (x62 + (x57 + (x53 + (x50 + (x48 + x1)))))))));
+  x106 = (x93 + (x84 + (x76 + (x69 + (x63 + (x58 + (x54 + (x51 + (x10 + x2)))))))));
+  x107 = (x94 + (x85 + (x77 + (x70 + (x64 + (x59 + (x55 + (x18 + (x11 + x3)))))))));
+  x108 = (x95 + (x86 + (x78 + (x71 + (x65 + (x60 + (x25 + (x19 + (x12 + x4)))))))));
+  x109 = (x96 + (x87 + (x79 + (x72 + (x66 + (x31 + (x26 + (x20 + (x13 + x5)))))))));
+  x110 = (x97 + (x88 + (x80 + (x73 + (x36 + (x32 + (x27 + (x21 + (x14 + x6)))))))));
+  x111 = (x98 + (x89 + (x81 + (x40 + (x37 + (x33 + (x28 + (x22 + (x15 + x7)))))))));
+  x112 = (x99 + (x90 + (x43 + (x41 + (x38 + (x34 + (x29 + (x23 + (x16 + x8)))))))));
+  x113 = (x102 + x112);
+  x114 = (x113 >> 25);
+  x115 = (uint32_t)(x113 & UINT32_C(0x1ffffff));
+  x116 = (x114 + x111);
+  x117 = (x116 >> 26);
+  x118 = (uint32_t)(x116 & UINT32_C(0x3ffffff));
+  x119 = (x117 + x110);
+  x120 = (x119 >> 25);
+  x121 = (uint32_t)(x119 & UINT32_C(0x1ffffff));
+  x122 = (x120 + x109);
+  x123 = (x122 >> 26);
+  x124 = (uint32_t)(x122 & UINT32_C(0x3ffffff));
+  x125 = (x123 + x108);
+  x126 = (x125 >> 25);
+  x127 = (uint32_t)(x125 & UINT32_C(0x1ffffff));
+  x128 = (x126 + x107);
+  x129 = (x128 >> 26);
+  x130 = (uint32_t)(x128 & UINT32_C(0x3ffffff));
+  x131 = (x129 + x106);
+  x132 = (x131 >> 25);
+  x133 = (uint32_t)(x131 & UINT32_C(0x1ffffff));
+  x134 = (x132 + x105);
+  x135 = (x134 >> 26);
+  x136 = (uint32_t)(x134 & UINT32_C(0x3ffffff));
+  x137 = (x135 + x104);
+  x138 = (x137 >> 25);
+  x139 = (uint32_t)(x137 & UINT32_C(0x1ffffff));
+  x140 = (x138 * UINT8_C(0x13));
+  x141 = (x103 + x140);
+  x142 = (uint32_t)(x141 >> 26);
+  x143 = (uint32_t)(x141 & UINT32_C(0x3ffffff));
+  x144 = (x142 + x115);
+  x145 = (fiat_25519_uint1)(x144 >> 25);
+  x146 = (x144 & UINT32_C(0x1ffffff));
+  x147 = (x145 + x118);
   out1[0] = x143;
   out1[1] = x146;
   out1[2] = x147;
@@ -303,135 +486,252 @@
 
 /*
  * The function fiat_25519_carry_square squares a field element and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * eval arg1) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  */
-static void fiat_25519_carry_square(uint32_t out1[10], const uint32_t arg1[10]) {
-  uint32_t x1 = ((arg1[9]) * UINT8_C(0x13));
-  uint32_t x2 = (x1 * 0x2);
-  uint32_t x3 = ((arg1[9]) * 0x2);
-  uint32_t x4 = ((arg1[8]) * UINT8_C(0x13));
-  uint64_t x5 = ((uint64_t)x4 * 0x2);
-  uint32_t x6 = ((arg1[8]) * 0x2);
-  uint32_t x7 = ((arg1[7]) * UINT8_C(0x13));
-  uint32_t x8 = (x7 * 0x2);
-  uint32_t x9 = ((arg1[7]) * 0x2);
-  uint32_t x10 = ((arg1[6]) * UINT8_C(0x13));
-  uint64_t x11 = ((uint64_t)x10 * 0x2);
-  uint32_t x12 = ((arg1[6]) * 0x2);
-  uint32_t x13 = ((arg1[5]) * UINT8_C(0x13));
-  uint32_t x14 = ((arg1[5]) * 0x2);
-  uint32_t x15 = ((arg1[4]) * 0x2);
-  uint32_t x16 = ((arg1[3]) * 0x2);
-  uint32_t x17 = ((arg1[2]) * 0x2);
-  uint32_t x18 = ((arg1[1]) * 0x2);
-  uint64_t x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2));
-  uint64_t x20 = ((uint64_t)(arg1[8]) * x2);
-  uint64_t x21 = ((uint64_t)(arg1[8]) * x4);
-  uint64_t x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2));
-  uint64_t x23 = ((arg1[7]) * x5);
-  uint64_t x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2));
-  uint64_t x25 = ((uint64_t)(arg1[6]) * x2);
-  uint64_t x26 = ((arg1[6]) * x5);
-  uint64_t x27 = ((uint64_t)(arg1[6]) * x8);
-  uint64_t x28 = ((uint64_t)(arg1[6]) * x10);
-  uint64_t x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2));
-  uint64_t x30 = ((arg1[5]) * x5);
-  uint64_t x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2));
-  uint64_t x32 = ((arg1[5]) * x11);
-  uint64_t x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2));
-  uint64_t x34 = ((uint64_t)(arg1[4]) * x2);
-  uint64_t x35 = ((arg1[4]) * x5);
-  uint64_t x36 = ((uint64_t)(arg1[4]) * x8);
-  uint64_t x37 = ((arg1[4]) * x11);
-  uint64_t x38 = ((uint64_t)(arg1[4]) * x14);
-  uint64_t x39 = ((uint64_t)(arg1[4]) * (arg1[4]));
-  uint64_t x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2));
-  uint64_t x41 = ((arg1[3]) * x5);
-  uint64_t x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2));
-  uint64_t x43 = ((uint64_t)(arg1[3]) * x12);
-  uint64_t x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2));
-  uint64_t x45 = ((uint64_t)(arg1[3]) * x15);
-  uint64_t x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2));
-  uint64_t x47 = ((uint64_t)(arg1[2]) * x2);
-  uint64_t x48 = ((arg1[2]) * x5);
-  uint64_t x49 = ((uint64_t)(arg1[2]) * x9);
-  uint64_t x50 = ((uint64_t)(arg1[2]) * x12);
-  uint64_t x51 = ((uint64_t)(arg1[2]) * x14);
-  uint64_t x52 = ((uint64_t)(arg1[2]) * x15);
-  uint64_t x53 = ((uint64_t)(arg1[2]) * x16);
-  uint64_t x54 = ((uint64_t)(arg1[2]) * (arg1[2]));
-  uint64_t x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2));
-  uint64_t x56 = ((uint64_t)(arg1[1]) * x6);
-  uint64_t x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2));
-  uint64_t x58 = ((uint64_t)(arg1[1]) * x12);
-  uint64_t x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2));
-  uint64_t x60 = ((uint64_t)(arg1[1]) * x15);
-  uint64_t x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2));
-  uint64_t x62 = ((uint64_t)(arg1[1]) * x17);
-  uint64_t x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2));
-  uint64_t x64 = ((uint64_t)(arg1[0]) * x3);
-  uint64_t x65 = ((uint64_t)(arg1[0]) * x6);
-  uint64_t x66 = ((uint64_t)(arg1[0]) * x9);
-  uint64_t x67 = ((uint64_t)(arg1[0]) * x12);
-  uint64_t x68 = ((uint64_t)(arg1[0]) * x14);
-  uint64_t x69 = ((uint64_t)(arg1[0]) * x15);
-  uint64_t x70 = ((uint64_t)(arg1[0]) * x16);
-  uint64_t x71 = ((uint64_t)(arg1[0]) * x17);
-  uint64_t x72 = ((uint64_t)(arg1[0]) * x18);
-  uint64_t x73 = ((uint64_t)(arg1[0]) * (arg1[0]));
-  uint64_t x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33)))));
-  uint64_t x75 = (x74 >> 26);
-  uint32_t x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff));
-  uint64_t x77 = (x64 + (x56 + (x49 + (x43 + x38))));
-  uint64_t x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19)))));
-  uint64_t x79 = (x66 + (x58 + (x51 + (x45 + x20))));
-  uint64_t x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21)))));
-  uint64_t x81 = (x68 + (x60 + (x53 + (x25 + x23))));
-  uint64_t x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24)))));
-  uint64_t x83 = (x70 + (x62 + (x34 + (x30 + x27))));
-  uint64_t x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28)))));
-  uint64_t x85 = (x72 + (x47 + (x41 + (x36 + x32))));
-  uint64_t x86 = (x75 + x85);
-  uint64_t x87 = (x86 >> 25);
-  uint32_t x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff));
-  uint64_t x89 = (x87 + x84);
-  uint64_t x90 = (x89 >> 26);
-  uint32_t x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff));
-  uint64_t x92 = (x90 + x83);
-  uint64_t x93 = (x92 >> 25);
-  uint32_t x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff));
-  uint64_t x95 = (x93 + x82);
-  uint64_t x96 = (x95 >> 26);
-  uint32_t x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff));
-  uint64_t x98 = (x96 + x81);
-  uint64_t x99 = (x98 >> 25);
-  uint32_t x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff));
-  uint64_t x101 = (x99 + x80);
-  uint64_t x102 = (x101 >> 26);
-  uint32_t x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
-  uint64_t x104 = (x102 + x79);
-  uint64_t x105 = (x104 >> 25);
-  uint32_t x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff));
-  uint64_t x107 = (x105 + x78);
-  uint64_t x108 = (x107 >> 26);
-  uint32_t x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff));
-  uint64_t x110 = (x108 + x77);
-  uint64_t x111 = (x110 >> 25);
-  uint32_t x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff));
-  uint64_t x113 = (x111 * UINT8_C(0x13));
-  uint64_t x114 = (x76 + x113);
-  uint32_t x115 = (uint32_t)(x114 >> 26);
-  uint32_t x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff));
-  uint32_t x117 = (x115 + x88);
-  fiat_25519_uint1 x118 = (fiat_25519_uint1)(x117 >> 25);
-  uint32_t x119 = (x117 & UINT32_C(0x1ffffff));
-  uint32_t x120 = (x118 + x91);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint64_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint64_t x11;
+  uint32_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint32_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  uint64_t x26;
+  uint64_t x27;
+  uint64_t x28;
+  uint64_t x29;
+  uint64_t x30;
+  uint64_t x31;
+  uint64_t x32;
+  uint64_t x33;
+  uint64_t x34;
+  uint64_t x35;
+  uint64_t x36;
+  uint64_t x37;
+  uint64_t x38;
+  uint64_t x39;
+  uint64_t x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  uint64_t x48;
+  uint64_t x49;
+  uint64_t x50;
+  uint64_t x51;
+  uint64_t x52;
+  uint64_t x53;
+  uint64_t x54;
+  uint64_t x55;
+  uint64_t x56;
+  uint64_t x57;
+  uint64_t x58;
+  uint64_t x59;
+  uint64_t x60;
+  uint64_t x61;
+  uint64_t x62;
+  uint64_t x63;
+  uint64_t x64;
+  uint64_t x65;
+  uint64_t x66;
+  uint64_t x67;
+  uint64_t x68;
+  uint64_t x69;
+  uint64_t x70;
+  uint64_t x71;
+  uint64_t x72;
+  uint64_t x73;
+  uint64_t x74;
+  uint64_t x75;
+  uint32_t x76;
+  uint64_t x77;
+  uint64_t x78;
+  uint64_t x79;
+  uint64_t x80;
+  uint64_t x81;
+  uint64_t x82;
+  uint64_t x83;
+  uint64_t x84;
+  uint64_t x85;
+  uint64_t x86;
+  uint64_t x87;
+  uint32_t x88;
+  uint64_t x89;
+  uint64_t x90;
+  uint32_t x91;
+  uint64_t x92;
+  uint64_t x93;
+  uint32_t x94;
+  uint64_t x95;
+  uint64_t x96;
+  uint32_t x97;
+  uint64_t x98;
+  uint64_t x99;
+  uint32_t x100;
+  uint64_t x101;
+  uint64_t x102;
+  uint32_t x103;
+  uint64_t x104;
+  uint64_t x105;
+  uint32_t x106;
+  uint64_t x107;
+  uint64_t x108;
+  uint32_t x109;
+  uint64_t x110;
+  uint64_t x111;
+  uint32_t x112;
+  uint64_t x113;
+  uint64_t x114;
+  uint32_t x115;
+  uint32_t x116;
+  uint32_t x117;
+  fiat_25519_uint1 x118;
+  uint32_t x119;
+  uint32_t x120;
+  x1 = ((arg1[9]) * UINT8_C(0x13));
+  x2 = (x1 * 0x2);
+  x3 = ((arg1[9]) * 0x2);
+  x4 = ((arg1[8]) * UINT8_C(0x13));
+  x5 = ((uint64_t)x4 * 0x2);
+  x6 = ((arg1[8]) * 0x2);
+  x7 = ((arg1[7]) * UINT8_C(0x13));
+  x8 = (x7 * 0x2);
+  x9 = ((arg1[7]) * 0x2);
+  x10 = ((arg1[6]) * UINT8_C(0x13));
+  x11 = ((uint64_t)x10 * 0x2);
+  x12 = ((arg1[6]) * 0x2);
+  x13 = ((arg1[5]) * UINT8_C(0x13));
+  x14 = ((arg1[5]) * 0x2);
+  x15 = ((arg1[4]) * 0x2);
+  x16 = ((arg1[3]) * 0x2);
+  x17 = ((arg1[2]) * 0x2);
+  x18 = ((arg1[1]) * 0x2);
+  x19 = ((uint64_t)(arg1[9]) * (x1 * 0x2));
+  x20 = ((uint64_t)(arg1[8]) * x2);
+  x21 = ((uint64_t)(arg1[8]) * x4);
+  x22 = ((arg1[7]) * ((uint64_t)x2 * 0x2));
+  x23 = ((arg1[7]) * x5);
+  x24 = ((uint64_t)(arg1[7]) * (x7 * 0x2));
+  x25 = ((uint64_t)(arg1[6]) * x2);
+  x26 = ((arg1[6]) * x5);
+  x27 = ((uint64_t)(arg1[6]) * x8);
+  x28 = ((uint64_t)(arg1[6]) * x10);
+  x29 = ((arg1[5]) * ((uint64_t)x2 * 0x2));
+  x30 = ((arg1[5]) * x5);
+  x31 = ((arg1[5]) * ((uint64_t)x8 * 0x2));
+  x32 = ((arg1[5]) * x11);
+  x33 = ((uint64_t)(arg1[5]) * (x13 * 0x2));
+  x34 = ((uint64_t)(arg1[4]) * x2);
+  x35 = ((arg1[4]) * x5);
+  x36 = ((uint64_t)(arg1[4]) * x8);
+  x37 = ((arg1[4]) * x11);
+  x38 = ((uint64_t)(arg1[4]) * x14);
+  x39 = ((uint64_t)(arg1[4]) * (arg1[4]));
+  x40 = ((arg1[3]) * ((uint64_t)x2 * 0x2));
+  x41 = ((arg1[3]) * x5);
+  x42 = ((arg1[3]) * ((uint64_t)x8 * 0x2));
+  x43 = ((uint64_t)(arg1[3]) * x12);
+  x44 = ((uint64_t)(arg1[3]) * (x14 * 0x2));
+  x45 = ((uint64_t)(arg1[3]) * x15);
+  x46 = ((uint64_t)(arg1[3]) * ((arg1[3]) * 0x2));
+  x47 = ((uint64_t)(arg1[2]) * x2);
+  x48 = ((arg1[2]) * x5);
+  x49 = ((uint64_t)(arg1[2]) * x9);
+  x50 = ((uint64_t)(arg1[2]) * x12);
+  x51 = ((uint64_t)(arg1[2]) * x14);
+  x52 = ((uint64_t)(arg1[2]) * x15);
+  x53 = ((uint64_t)(arg1[2]) * x16);
+  x54 = ((uint64_t)(arg1[2]) * (arg1[2]));
+  x55 = ((arg1[1]) * ((uint64_t)x2 * 0x2));
+  x56 = ((uint64_t)(arg1[1]) * x6);
+  x57 = ((uint64_t)(arg1[1]) * (x9 * 0x2));
+  x58 = ((uint64_t)(arg1[1]) * x12);
+  x59 = ((uint64_t)(arg1[1]) * (x14 * 0x2));
+  x60 = ((uint64_t)(arg1[1]) * x15);
+  x61 = ((uint64_t)(arg1[1]) * (x16 * 0x2));
+  x62 = ((uint64_t)(arg1[1]) * x17);
+  x63 = ((uint64_t)(arg1[1]) * ((arg1[1]) * 0x2));
+  x64 = ((uint64_t)(arg1[0]) * x3);
+  x65 = ((uint64_t)(arg1[0]) * x6);
+  x66 = ((uint64_t)(arg1[0]) * x9);
+  x67 = ((uint64_t)(arg1[0]) * x12);
+  x68 = ((uint64_t)(arg1[0]) * x14);
+  x69 = ((uint64_t)(arg1[0]) * x15);
+  x70 = ((uint64_t)(arg1[0]) * x16);
+  x71 = ((uint64_t)(arg1[0]) * x17);
+  x72 = ((uint64_t)(arg1[0]) * x18);
+  x73 = ((uint64_t)(arg1[0]) * (arg1[0]));
+  x74 = (x73 + (x55 + (x48 + (x42 + (x37 + x33)))));
+  x75 = (x74 >> 26);
+  x76 = (uint32_t)(x74 & UINT32_C(0x3ffffff));
+  x77 = (x64 + (x56 + (x49 + (x43 + x38))));
+  x78 = (x65 + (x57 + (x50 + (x44 + (x39 + x19)))));
+  x79 = (x66 + (x58 + (x51 + (x45 + x20))));
+  x80 = (x67 + (x59 + (x52 + (x46 + (x22 + x21)))));
+  x81 = (x68 + (x60 + (x53 + (x25 + x23))));
+  x82 = (x69 + (x61 + (x54 + (x29 + (x26 + x24)))));
+  x83 = (x70 + (x62 + (x34 + (x30 + x27))));
+  x84 = (x71 + (x63 + (x40 + (x35 + (x31 + x28)))));
+  x85 = (x72 + (x47 + (x41 + (x36 + x32))));
+  x86 = (x75 + x85);
+  x87 = (x86 >> 25);
+  x88 = (uint32_t)(x86 & UINT32_C(0x1ffffff));
+  x89 = (x87 + x84);
+  x90 = (x89 >> 26);
+  x91 = (uint32_t)(x89 & UINT32_C(0x3ffffff));
+  x92 = (x90 + x83);
+  x93 = (x92 >> 25);
+  x94 = (uint32_t)(x92 & UINT32_C(0x1ffffff));
+  x95 = (x93 + x82);
+  x96 = (x95 >> 26);
+  x97 = (uint32_t)(x95 & UINT32_C(0x3ffffff));
+  x98 = (x96 + x81);
+  x99 = (x98 >> 25);
+  x100 = (uint32_t)(x98 & UINT32_C(0x1ffffff));
+  x101 = (x99 + x80);
+  x102 = (x101 >> 26);
+  x103 = (uint32_t)(x101 & UINT32_C(0x3ffffff));
+  x104 = (x102 + x79);
+  x105 = (x104 >> 25);
+  x106 = (uint32_t)(x104 & UINT32_C(0x1ffffff));
+  x107 = (x105 + x78);
+  x108 = (x107 >> 26);
+  x109 = (uint32_t)(x107 & UINT32_C(0x3ffffff));
+  x110 = (x108 + x77);
+  x111 = (x110 >> 25);
+  x112 = (uint32_t)(x110 & UINT32_C(0x1ffffff));
+  x113 = (x111 * UINT8_C(0x13));
+  x114 = (x76 + x113);
+  x115 = (uint32_t)(x114 >> 26);
+  x116 = (uint32_t)(x114 & UINT32_C(0x3ffffff));
+  x117 = (x115 + x88);
+  x118 = (fiat_25519_uint1)(x117 >> 25);
+  x119 = (x117 & UINT32_C(0x1ffffff));
+  x120 = (x118 + x91);
   out1[0] = x116;
   out1[1] = x119;
   out1[2] = x120;
@@ -446,37 +746,56 @@
 
 /*
  * The function fiat_25519_carry reduces a field element.
+ *
  * Postconditions:
  *   eval out1 mod m = eval arg1 mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  */
-static void fiat_25519_carry(uint32_t out1[10], const uint32_t arg1[10]) {
-  uint32_t x1 = (arg1[0]);
-  uint32_t x2 = ((x1 >> 26) + (arg1[1]));
-  uint32_t x3 = ((x2 >> 25) + (arg1[2]));
-  uint32_t x4 = ((x3 >> 26) + (arg1[3]));
-  uint32_t x5 = ((x4 >> 25) + (arg1[4]));
-  uint32_t x6 = ((x5 >> 26) + (arg1[5]));
-  uint32_t x7 = ((x6 >> 25) + (arg1[6]));
-  uint32_t x8 = ((x7 >> 26) + (arg1[7]));
-  uint32_t x9 = ((x8 >> 25) + (arg1[8]));
-  uint32_t x10 = ((x9 >> 26) + (arg1[9]));
-  uint32_t x11 = ((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13)));
-  uint32_t x12 = ((fiat_25519_uint1)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff)));
-  uint32_t x13 = (x11 & UINT32_C(0x3ffffff));
-  uint32_t x14 = (x12 & UINT32_C(0x1ffffff));
-  uint32_t x15 = ((fiat_25519_uint1)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff)));
-  uint32_t x16 = (x4 & UINT32_C(0x1ffffff));
-  uint32_t x17 = (x5 & UINT32_C(0x3ffffff));
-  uint32_t x18 = (x6 & UINT32_C(0x1ffffff));
-  uint32_t x19 = (x7 & UINT32_C(0x3ffffff));
-  uint32_t x20 = (x8 & UINT32_C(0x1ffffff));
-  uint32_t x21 = (x9 & UINT32_C(0x3ffffff));
-  uint32_t x22 = (x10 & UINT32_C(0x1ffffff));
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint32_t x11;
+  uint32_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint32_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint32_t x19;
+  uint32_t x20;
+  uint32_t x21;
+  uint32_t x22;
+  x1 = (arg1[0]);
+  x2 = ((x1 >> 26) + (arg1[1]));
+  x3 = ((x2 >> 25) + (arg1[2]));
+  x4 = ((x3 >> 26) + (arg1[3]));
+  x5 = ((x4 >> 25) + (arg1[4]));
+  x6 = ((x5 >> 26) + (arg1[5]));
+  x7 = ((x6 >> 25) + (arg1[6]));
+  x8 = ((x7 >> 26) + (arg1[7]));
+  x9 = ((x8 >> 25) + (arg1[8]));
+  x10 = ((x9 >> 26) + (arg1[9]));
+  x11 = ((x1 & UINT32_C(0x3ffffff)) + ((x10 >> 25) * UINT8_C(0x13)));
+  x12 = ((fiat_25519_uint1)(x11 >> 26) + (x2 & UINT32_C(0x1ffffff)));
+  x13 = (x11 & UINT32_C(0x3ffffff));
+  x14 = (x12 & UINT32_C(0x1ffffff));
+  x15 = ((fiat_25519_uint1)(x12 >> 25) + (x3 & UINT32_C(0x3ffffff)));
+  x16 = (x4 & UINT32_C(0x1ffffff));
+  x17 = (x5 & UINT32_C(0x3ffffff));
+  x18 = (x6 & UINT32_C(0x1ffffff));
+  x19 = (x7 & UINT32_C(0x3ffffff));
+  x20 = (x8 & UINT32_C(0x1ffffff));
+  x21 = (x9 & UINT32_C(0x3ffffff));
+  x22 = (x10 & UINT32_C(0x1ffffff));
   out1[0] = x13;
   out1[1] = x14;
   out1[2] = x15;
@@ -491,26 +810,32 @@
 
 /*
  * The function fiat_25519_add adds two field elements.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 + eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
- *   arg2: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
  */
-static void fiat_25519_add(uint32_t out1[10], const uint32_t arg1[10], const uint32_t arg2[10]) {
-  uint32_t x1 = ((arg1[0]) + (arg2[0]));
-  uint32_t x2 = ((arg1[1]) + (arg2[1]));
-  uint32_t x3 = ((arg1[2]) + (arg2[2]));
-  uint32_t x4 = ((arg1[3]) + (arg2[3]));
-  uint32_t x5 = ((arg1[4]) + (arg2[4]));
-  uint32_t x6 = ((arg1[5]) + (arg2[5]));
-  uint32_t x7 = ((arg1[6]) + (arg2[6]));
-  uint32_t x8 = ((arg1[7]) + (arg2[7]));
-  uint32_t x9 = ((arg1[8]) + (arg2[8]));
-  uint32_t x10 = ((arg1[9]) + (arg2[9]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  x1 = ((arg1[0]) + (arg2[0]));
+  x2 = ((arg1[1]) + (arg2[1]));
+  x3 = ((arg1[2]) + (arg2[2]));
+  x4 = ((arg1[3]) + (arg2[3]));
+  x5 = ((arg1[4]) + (arg2[4]));
+  x6 = ((arg1[5]) + (arg2[5]));
+  x7 = ((arg1[6]) + (arg2[6]));
+  x8 = ((arg1[7]) + (arg2[7]));
+  x9 = ((arg1[8]) + (arg2[8]));
+  x10 = ((arg1[9]) + (arg2[9]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -525,26 +850,32 @@
 
 /*
  * The function fiat_25519_sub subtracts two field elements.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 - eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
- *   arg2: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
  */
-static void fiat_25519_sub(uint32_t out1[10], const uint32_t arg1[10], const uint32_t arg2[10]) {
-  uint32_t x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0]));
-  uint32_t x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1]));
-  uint32_t x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2]));
-  uint32_t x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3]));
-  uint32_t x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4]));
-  uint32_t x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5]));
-  uint32_t x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6]));
-  uint32_t x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7]));
-  uint32_t x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8]));
-  uint32_t x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  x1 = ((UINT32_C(0x7ffffda) + (arg1[0])) - (arg2[0]));
+  x2 = ((UINT32_C(0x3fffffe) + (arg1[1])) - (arg2[1]));
+  x3 = ((UINT32_C(0x7fffffe) + (arg1[2])) - (arg2[2]));
+  x4 = ((UINT32_C(0x3fffffe) + (arg1[3])) - (arg2[3]));
+  x5 = ((UINT32_C(0x7fffffe) + (arg1[4])) - (arg2[4]));
+  x6 = ((UINT32_C(0x3fffffe) + (arg1[5])) - (arg2[5]));
+  x7 = ((UINT32_C(0x7fffffe) + (arg1[6])) - (arg2[6]));
+  x8 = ((UINT32_C(0x3fffffe) + (arg1[7])) - (arg2[7]));
+  x9 = ((UINT32_C(0x7fffffe) + (arg1[8])) - (arg2[8]));
+  x10 = ((UINT32_C(0x3fffffe) + (arg1[9])) - (arg2[9]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -559,25 +890,32 @@
 
 /*
  * The function fiat_25519_opp negates a field element.
+ *
  * Postconditions:
  *   eval out1 mod m = -eval arg1 mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
  */
-static void fiat_25519_opp(uint32_t out1[10], const uint32_t arg1[10]) {
-  uint32_t x1 = (UINT32_C(0x7ffffda) - (arg1[0]));
-  uint32_t x2 = (UINT32_C(0x3fffffe) - (arg1[1]));
-  uint32_t x3 = (UINT32_C(0x7fffffe) - (arg1[2]));
-  uint32_t x4 = (UINT32_C(0x3fffffe) - (arg1[3]));
-  uint32_t x5 = (UINT32_C(0x7fffffe) - (arg1[4]));
-  uint32_t x6 = (UINT32_C(0x3fffffe) - (arg1[5]));
-  uint32_t x7 = (UINT32_C(0x7fffffe) - (arg1[6]));
-  uint32_t x8 = (UINT32_C(0x3fffffe) - (arg1[7]));
-  uint32_t x9 = (UINT32_C(0x7fffffe) - (arg1[8]));
-  uint32_t x10 = (UINT32_C(0x3fffffe) - (arg1[9]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  x1 = (UINT32_C(0x7ffffda) - (arg1[0]));
+  x2 = (UINT32_C(0x3fffffe) - (arg1[1]));
+  x3 = (UINT32_C(0x7fffffe) - (arg1[2]));
+  x4 = (UINT32_C(0x3fffffe) - (arg1[3]));
+  x5 = (UINT32_C(0x7fffffe) - (arg1[4]));
+  x6 = (UINT32_C(0x3fffffe) - (arg1[5]));
+  x7 = (UINT32_C(0x7fffffe) - (arg1[6]));
+  x8 = (UINT32_C(0x3fffffe) - (arg1[7]));
+  x9 = (UINT32_C(0x7fffffe) - (arg1[8]));
+  x10 = (UINT32_C(0x3fffffe) - (arg1[9]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -592,6 +930,7 @@
 
 /*
  * The function fiat_25519_selectznz is a multi-limb conditional select.
+ *
  * Postconditions:
  *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
  *
@@ -602,26 +941,26 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_25519_selectznz(uint32_t out1[10], fiat_25519_uint1 arg1, const uint32_t arg2[10], const uint32_t arg3[10]) {
+static FIAT_25519_FIAT_INLINE void fiat_25519_selectznz(uint32_t out1[10], fiat_25519_uint1 arg1, const uint32_t arg2[10], const uint32_t arg3[10]) {
   uint32_t x1;
-  fiat_25519_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0]));
   uint32_t x2;
-  fiat_25519_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1]));
   uint32_t x3;
-  fiat_25519_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2]));
   uint32_t x4;
-  fiat_25519_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3]));
   uint32_t x5;
-  fiat_25519_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4]));
   uint32_t x6;
-  fiat_25519_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5]));
   uint32_t x7;
-  fiat_25519_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6]));
   uint32_t x8;
-  fiat_25519_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7]));
   uint32_t x9;
-  fiat_25519_cmovznz_u32(&x9, arg1, (arg2[8]), (arg3[8]));
   uint32_t x10;
+  fiat_25519_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_25519_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_25519_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_25519_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3]));
+  fiat_25519_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4]));
+  fiat_25519_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5]));
+  fiat_25519_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6]));
+  fiat_25519_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7]));
+  fiat_25519_cmovznz_u32(&x9, arg1, (arg2[8]), (arg3[8]));
   fiat_25519_cmovznz_u32(&x10, arg1, (arg2[9]), (arg3[9]));
   out1[0] = x1;
   out1[1] = x2;
@@ -637,336 +976,582 @@
 
 /*
  * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order.
+ *
  * Postconditions:
  *   out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31]
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  * Output Bounds:
  *   out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
  */
-static void fiat_25519_to_bytes(uint8_t out1[32], const uint32_t arg1[10]) {
+static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) {
   uint32_t x1;
   fiat_25519_uint1 x2;
-  fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]), UINT32_C(0x3ffffed));
   uint32_t x3;
   fiat_25519_uint1 x4;
-  fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff));
   uint32_t x5;
   fiat_25519_uint1 x6;
-  fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff));
   uint32_t x7;
   fiat_25519_uint1 x8;
-  fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff));
   uint32_t x9;
   fiat_25519_uint1 x10;
-  fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]), UINT32_C(0x3ffffff));
   uint32_t x11;
   fiat_25519_uint1 x12;
-  fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]), UINT32_C(0x1ffffff));
   uint32_t x13;
   fiat_25519_uint1 x14;
-  fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]), UINT32_C(0x3ffffff));
   uint32_t x15;
   fiat_25519_uint1 x16;
-  fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]), UINT32_C(0x1ffffff));
   uint32_t x17;
   fiat_25519_uint1 x18;
-  fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]), UINT32_C(0x3ffffff));
   uint32_t x19;
   fiat_25519_uint1 x20;
-  fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]), UINT32_C(0x1ffffff));
   uint32_t x21;
-  fiat_25519_cmovznz_u32(&x21, x20, 0x0, UINT32_C(0xffffffff));
   uint32_t x22;
   fiat_25519_uint1 x23;
-  fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1, (x21 & UINT32_C(0x3ffffed)));
   uint32_t x24;
   fiat_25519_uint1 x25;
-  fiat_25519_addcarryx_u25(&x24, &x25, x23, x3, (x21 & UINT32_C(0x1ffffff)));
   uint32_t x26;
   fiat_25519_uint1 x27;
-  fiat_25519_addcarryx_u26(&x26, &x27, x25, x5, (x21 & UINT32_C(0x3ffffff)));
   uint32_t x28;
   fiat_25519_uint1 x29;
-  fiat_25519_addcarryx_u25(&x28, &x29, x27, x7, (x21 & UINT32_C(0x1ffffff)));
   uint32_t x30;
   fiat_25519_uint1 x31;
-  fiat_25519_addcarryx_u26(&x30, &x31, x29, x9, (x21 & UINT32_C(0x3ffffff)));
   uint32_t x32;
   fiat_25519_uint1 x33;
-  fiat_25519_addcarryx_u25(&x32, &x33, x31, x11, (x21 & UINT32_C(0x1ffffff)));
   uint32_t x34;
   fiat_25519_uint1 x35;
-  fiat_25519_addcarryx_u26(&x34, &x35, x33, x13, (x21 & UINT32_C(0x3ffffff)));
   uint32_t x36;
   fiat_25519_uint1 x37;
-  fiat_25519_addcarryx_u25(&x36, &x37, x35, x15, (x21 & UINT32_C(0x1ffffff)));
   uint32_t x38;
   fiat_25519_uint1 x39;
-  fiat_25519_addcarryx_u26(&x38, &x39, x37, x17, (x21 & UINT32_C(0x3ffffff)));
   uint32_t x40;
   fiat_25519_uint1 x41;
+  uint32_t x42;
+  uint32_t x43;
+  uint32_t x44;
+  uint32_t x45;
+  uint32_t x46;
+  uint32_t x47;
+  uint32_t x48;
+  uint32_t x49;
+  uint8_t x50;
+  uint32_t x51;
+  uint8_t x52;
+  uint32_t x53;
+  uint8_t x54;
+  uint8_t x55;
+  uint32_t x56;
+  uint8_t x57;
+  uint32_t x58;
+  uint8_t x59;
+  uint32_t x60;
+  uint8_t x61;
+  uint8_t x62;
+  uint32_t x63;
+  uint8_t x64;
+  uint32_t x65;
+  uint8_t x66;
+  uint32_t x67;
+  uint8_t x68;
+  uint8_t x69;
+  uint32_t x70;
+  uint8_t x71;
+  uint32_t x72;
+  uint8_t x73;
+  uint32_t x74;
+  uint8_t x75;
+  uint8_t x76;
+  uint32_t x77;
+  uint8_t x78;
+  uint32_t x79;
+  uint8_t x80;
+  uint32_t x81;
+  uint8_t x82;
+  uint8_t x83;
+  uint8_t x84;
+  uint32_t x85;
+  uint8_t x86;
+  uint32_t x87;
+  uint8_t x88;
+  fiat_25519_uint1 x89;
+  uint32_t x90;
+  uint8_t x91;
+  uint32_t x92;
+  uint8_t x93;
+  uint32_t x94;
+  uint8_t x95;
+  uint8_t x96;
+  uint32_t x97;
+  uint8_t x98;
+  uint32_t x99;
+  uint8_t x100;
+  uint32_t x101;
+  uint8_t x102;
+  uint8_t x103;
+  uint32_t x104;
+  uint8_t x105;
+  uint32_t x106;
+  uint8_t x107;
+  uint32_t x108;
+  uint8_t x109;
+  uint8_t x110;
+  uint32_t x111;
+  uint8_t x112;
+  uint32_t x113;
+  uint8_t x114;
+  uint32_t x115;
+  uint8_t x116;
+  uint8_t x117;
+  fiat_25519_subborrowx_u26(&x1, &x2, 0x0, (arg1[0]), UINT32_C(0x3ffffed));
+  fiat_25519_subborrowx_u25(&x3, &x4, x2, (arg1[1]), UINT32_C(0x1ffffff));
+  fiat_25519_subborrowx_u26(&x5, &x6, x4, (arg1[2]), UINT32_C(0x3ffffff));
+  fiat_25519_subborrowx_u25(&x7, &x8, x6, (arg1[3]), UINT32_C(0x1ffffff));
+  fiat_25519_subborrowx_u26(&x9, &x10, x8, (arg1[4]), UINT32_C(0x3ffffff));
+  fiat_25519_subborrowx_u25(&x11, &x12, x10, (arg1[5]), UINT32_C(0x1ffffff));
+  fiat_25519_subborrowx_u26(&x13, &x14, x12, (arg1[6]), UINT32_C(0x3ffffff));
+  fiat_25519_subborrowx_u25(&x15, &x16, x14, (arg1[7]), UINT32_C(0x1ffffff));
+  fiat_25519_subborrowx_u26(&x17, &x18, x16, (arg1[8]), UINT32_C(0x3ffffff));
+  fiat_25519_subborrowx_u25(&x19, &x20, x18, (arg1[9]), UINT32_C(0x1ffffff));
+  fiat_25519_cmovznz_u32(&x21, x20, 0x0, UINT32_C(0xffffffff));
+  fiat_25519_addcarryx_u26(&x22, &x23, 0x0, x1, (x21 & UINT32_C(0x3ffffed)));
+  fiat_25519_addcarryx_u25(&x24, &x25, x23, x3, (x21 & UINT32_C(0x1ffffff)));
+  fiat_25519_addcarryx_u26(&x26, &x27, x25, x5, (x21 & UINT32_C(0x3ffffff)));
+  fiat_25519_addcarryx_u25(&x28, &x29, x27, x7, (x21 & UINT32_C(0x1ffffff)));
+  fiat_25519_addcarryx_u26(&x30, &x31, x29, x9, (x21 & UINT32_C(0x3ffffff)));
+  fiat_25519_addcarryx_u25(&x32, &x33, x31, x11, (x21 & UINT32_C(0x1ffffff)));
+  fiat_25519_addcarryx_u26(&x34, &x35, x33, x13, (x21 & UINT32_C(0x3ffffff)));
+  fiat_25519_addcarryx_u25(&x36, &x37, x35, x15, (x21 & UINT32_C(0x1ffffff)));
+  fiat_25519_addcarryx_u26(&x38, &x39, x37, x17, (x21 & UINT32_C(0x3ffffff)));
   fiat_25519_addcarryx_u25(&x40, &x41, x39, x19, (x21 & UINT32_C(0x1ffffff)));
-  uint32_t x42 = (x40 << 6);
-  uint32_t x43 = (x38 << 4);
-  uint32_t x44 = (x36 << 3);
-  uint32_t x45 = (x34 * (uint32_t)0x2);
-  uint32_t x46 = (x30 << 6);
-  uint32_t x47 = (x28 << 5);
-  uint32_t x48 = (x26 << 3);
-  uint32_t x49 = (x24 << 2);
-  uint32_t x50 = (x22 >> 8);
-  uint8_t x51 = (uint8_t)(x22 & UINT8_C(0xff));
-  uint32_t x52 = (x50 >> 8);
-  uint8_t x53 = (uint8_t)(x50 & UINT8_C(0xff));
-  uint8_t x54 = (uint8_t)(x52 >> 8);
-  uint8_t x55 = (uint8_t)(x52 & UINT8_C(0xff));
-  uint32_t x56 = (x54 + x49);
-  uint32_t x57 = (x56 >> 8);
-  uint8_t x58 = (uint8_t)(x56 & UINT8_C(0xff));
-  uint32_t x59 = (x57 >> 8);
-  uint8_t x60 = (uint8_t)(x57 & UINT8_C(0xff));
-  uint8_t x61 = (uint8_t)(x59 >> 8);
-  uint8_t x62 = (uint8_t)(x59 & UINT8_C(0xff));
-  uint32_t x63 = (x61 + x48);
-  uint32_t x64 = (x63 >> 8);
-  uint8_t x65 = (uint8_t)(x63 & UINT8_C(0xff));
-  uint32_t x66 = (x64 >> 8);
-  uint8_t x67 = (uint8_t)(x64 & UINT8_C(0xff));
-  uint8_t x68 = (uint8_t)(x66 >> 8);
-  uint8_t x69 = (uint8_t)(x66 & UINT8_C(0xff));
-  uint32_t x70 = (x68 + x47);
-  uint32_t x71 = (x70 >> 8);
-  uint8_t x72 = (uint8_t)(x70 & UINT8_C(0xff));
-  uint32_t x73 = (x71 >> 8);
-  uint8_t x74 = (uint8_t)(x71 & UINT8_C(0xff));
-  uint8_t x75 = (uint8_t)(x73 >> 8);
-  uint8_t x76 = (uint8_t)(x73 & UINT8_C(0xff));
-  uint32_t x77 = (x75 + x46);
-  uint32_t x78 = (x77 >> 8);
-  uint8_t x79 = (uint8_t)(x77 & UINT8_C(0xff));
-  uint32_t x80 = (x78 >> 8);
-  uint8_t x81 = (uint8_t)(x78 & UINT8_C(0xff));
-  uint8_t x82 = (uint8_t)(x80 >> 8);
-  uint8_t x83 = (uint8_t)(x80 & UINT8_C(0xff));
-  uint8_t x84 = (uint8_t)(x82 & UINT8_C(0xff));
-  uint32_t x85 = (x32 >> 8);
-  uint8_t x86 = (uint8_t)(x32 & UINT8_C(0xff));
-  uint32_t x87 = (x85 >> 8);
-  uint8_t x88 = (uint8_t)(x85 & UINT8_C(0xff));
-  fiat_25519_uint1 x89 = (fiat_25519_uint1)(x87 >> 8);
-  uint8_t x90 = (uint8_t)(x87 & UINT8_C(0xff));
-  uint32_t x91 = (x89 + x45);
-  uint32_t x92 = (x91 >> 8);
-  uint8_t x93 = (uint8_t)(x91 & UINT8_C(0xff));
-  uint32_t x94 = (x92 >> 8);
-  uint8_t x95 = (uint8_t)(x92 & UINT8_C(0xff));
-  uint8_t x96 = (uint8_t)(x94 >> 8);
-  uint8_t x97 = (uint8_t)(x94 & UINT8_C(0xff));
-  uint32_t x98 = (x96 + x44);
-  uint32_t x99 = (x98 >> 8);
-  uint8_t x100 = (uint8_t)(x98 & UINT8_C(0xff));
-  uint32_t x101 = (x99 >> 8);
-  uint8_t x102 = (uint8_t)(x99 & UINT8_C(0xff));
-  uint8_t x103 = (uint8_t)(x101 >> 8);
-  uint8_t x104 = (uint8_t)(x101 & UINT8_C(0xff));
-  uint32_t x105 = (x103 + x43);
-  uint32_t x106 = (x105 >> 8);
-  uint8_t x107 = (uint8_t)(x105 & UINT8_C(0xff));
-  uint32_t x108 = (x106 >> 8);
-  uint8_t x109 = (uint8_t)(x106 & UINT8_C(0xff));
-  uint8_t x110 = (uint8_t)(x108 >> 8);
-  uint8_t x111 = (uint8_t)(x108 & UINT8_C(0xff));
-  uint32_t x112 = (x110 + x42);
-  uint32_t x113 = (x112 >> 8);
-  uint8_t x114 = (uint8_t)(x112 & UINT8_C(0xff));
-  uint32_t x115 = (x113 >> 8);
-  uint8_t x116 = (uint8_t)(x113 & UINT8_C(0xff));
-  uint8_t x117 = (uint8_t)(x115 >> 8);
-  uint8_t x118 = (uint8_t)(x115 & UINT8_C(0xff));
-  out1[0] = x51;
-  out1[1] = x53;
-  out1[2] = x55;
-  out1[3] = x58;
-  out1[4] = x60;
-  out1[5] = x62;
-  out1[6] = x65;
-  out1[7] = x67;
-  out1[8] = x69;
-  out1[9] = x72;
-  out1[10] = x74;
-  out1[11] = x76;
-  out1[12] = x79;
-  out1[13] = x81;
-  out1[14] = x83;
-  out1[15] = x84;
-  out1[16] = x86;
-  out1[17] = x88;
-  out1[18] = x90;
-  out1[19] = x93;
-  out1[20] = x95;
-  out1[21] = x97;
-  out1[22] = x100;
-  out1[23] = x102;
-  out1[24] = x104;
-  out1[25] = x107;
-  out1[26] = x109;
-  out1[27] = x111;
-  out1[28] = x114;
-  out1[29] = x116;
-  out1[30] = x118;
+  x42 = (x40 << 6);
+  x43 = (x38 << 4);
+  x44 = (x36 << 3);
+  x45 = (x34 * (uint32_t)0x2);
+  x46 = (x30 << 6);
+  x47 = (x28 << 5);
+  x48 = (x26 << 3);
+  x49 = (x24 << 2);
+  x50 = (uint8_t)(x22 & UINT8_C(0xff));
+  x51 = (x22 >> 8);
+  x52 = (uint8_t)(x51 & UINT8_C(0xff));
+  x53 = (x51 >> 8);
+  x54 = (uint8_t)(x53 & UINT8_C(0xff));
+  x55 = (uint8_t)(x53 >> 8);
+  x56 = (x49 + (uint32_t)x55);
+  x57 = (uint8_t)(x56 & UINT8_C(0xff));
+  x58 = (x56 >> 8);
+  x59 = (uint8_t)(x58 & UINT8_C(0xff));
+  x60 = (x58 >> 8);
+  x61 = (uint8_t)(x60 & UINT8_C(0xff));
+  x62 = (uint8_t)(x60 >> 8);
+  x63 = (x48 + (uint32_t)x62);
+  x64 = (uint8_t)(x63 & UINT8_C(0xff));
+  x65 = (x63 >> 8);
+  x66 = (uint8_t)(x65 & UINT8_C(0xff));
+  x67 = (x65 >> 8);
+  x68 = (uint8_t)(x67 & UINT8_C(0xff));
+  x69 = (uint8_t)(x67 >> 8);
+  x70 = (x47 + (uint32_t)x69);
+  x71 = (uint8_t)(x70 & UINT8_C(0xff));
+  x72 = (x70 >> 8);
+  x73 = (uint8_t)(x72 & UINT8_C(0xff));
+  x74 = (x72 >> 8);
+  x75 = (uint8_t)(x74 & UINT8_C(0xff));
+  x76 = (uint8_t)(x74 >> 8);
+  x77 = (x46 + (uint32_t)x76);
+  x78 = (uint8_t)(x77 & UINT8_C(0xff));
+  x79 = (x77 >> 8);
+  x80 = (uint8_t)(x79 & UINT8_C(0xff));
+  x81 = (x79 >> 8);
+  x82 = (uint8_t)(x81 & UINT8_C(0xff));
+  x83 = (uint8_t)(x81 >> 8);
+  x84 = (uint8_t)(x32 & UINT8_C(0xff));
+  x85 = (x32 >> 8);
+  x86 = (uint8_t)(x85 & UINT8_C(0xff));
+  x87 = (x85 >> 8);
+  x88 = (uint8_t)(x87 & UINT8_C(0xff));
+  x89 = (fiat_25519_uint1)(x87 >> 8);
+  x90 = (x45 + (uint32_t)x89);
+  x91 = (uint8_t)(x90 & UINT8_C(0xff));
+  x92 = (x90 >> 8);
+  x93 = (uint8_t)(x92 & UINT8_C(0xff));
+  x94 = (x92 >> 8);
+  x95 = (uint8_t)(x94 & UINT8_C(0xff));
+  x96 = (uint8_t)(x94 >> 8);
+  x97 = (x44 + (uint32_t)x96);
+  x98 = (uint8_t)(x97 & UINT8_C(0xff));
+  x99 = (x97 >> 8);
+  x100 = (uint8_t)(x99 & UINT8_C(0xff));
+  x101 = (x99 >> 8);
+  x102 = (uint8_t)(x101 & UINT8_C(0xff));
+  x103 = (uint8_t)(x101 >> 8);
+  x104 = (x43 + (uint32_t)x103);
+  x105 = (uint8_t)(x104 & UINT8_C(0xff));
+  x106 = (x104 >> 8);
+  x107 = (uint8_t)(x106 & UINT8_C(0xff));
+  x108 = (x106 >> 8);
+  x109 = (uint8_t)(x108 & UINT8_C(0xff));
+  x110 = (uint8_t)(x108 >> 8);
+  x111 = (x42 + (uint32_t)x110);
+  x112 = (uint8_t)(x111 & UINT8_C(0xff));
+  x113 = (x111 >> 8);
+  x114 = (uint8_t)(x113 & UINT8_C(0xff));
+  x115 = (x113 >> 8);
+  x116 = (uint8_t)(x115 & UINT8_C(0xff));
+  x117 = (uint8_t)(x115 >> 8);
+  out1[0] = x50;
+  out1[1] = x52;
+  out1[2] = x54;
+  out1[3] = x57;
+  out1[4] = x59;
+  out1[5] = x61;
+  out1[6] = x64;
+  out1[7] = x66;
+  out1[8] = x68;
+  out1[9] = x71;
+  out1[10] = x73;
+  out1[11] = x75;
+  out1[12] = x78;
+  out1[13] = x80;
+  out1[14] = x82;
+  out1[15] = x83;
+  out1[16] = x84;
+  out1[17] = x86;
+  out1[18] = x88;
+  out1[19] = x91;
+  out1[20] = x93;
+  out1[21] = x95;
+  out1[22] = x98;
+  out1[23] = x100;
+  out1[24] = x102;
+  out1[25] = x105;
+  out1[26] = x107;
+  out1[27] = x109;
+  out1[28] = x112;
+  out1[29] = x114;
+  out1[30] = x116;
   out1[31] = x117;
 }
 
 /*
  * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order.
+ *
  * Postconditions:
  *   eval out1 mod m = bytes_eval arg1 mod m
  *
  * Input Bounds:
  *   arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  */
-static void fiat_25519_from_bytes(uint32_t out1[10], const uint8_t arg1[32]) {
-  uint32_t x1 = ((uint32_t)(arg1[31]) << 18);
-  uint32_t x2 = ((uint32_t)(arg1[30]) << 10);
-  uint32_t x3 = ((uint32_t)(arg1[29]) << 2);
-  uint32_t x4 = ((uint32_t)(arg1[28]) << 20);
-  uint32_t x5 = ((uint32_t)(arg1[27]) << 12);
-  uint32_t x6 = ((uint32_t)(arg1[26]) << 4);
-  uint32_t x7 = ((uint32_t)(arg1[25]) << 21);
-  uint32_t x8 = ((uint32_t)(arg1[24]) << 13);
-  uint32_t x9 = ((uint32_t)(arg1[23]) << 5);
-  uint32_t x10 = ((uint32_t)(arg1[22]) << 23);
-  uint32_t x11 = ((uint32_t)(arg1[21]) << 15);
-  uint32_t x12 = ((uint32_t)(arg1[20]) << 7);
-  uint32_t x13 = ((uint32_t)(arg1[19]) << 24);
-  uint32_t x14 = ((uint32_t)(arg1[18]) << 16);
-  uint32_t x15 = ((uint32_t)(arg1[17]) << 8);
-  uint8_t x16 = (arg1[16]);
-  uint32_t x17 = ((uint32_t)(arg1[15]) << 18);
-  uint32_t x18 = ((uint32_t)(arg1[14]) << 10);
-  uint32_t x19 = ((uint32_t)(arg1[13]) << 2);
-  uint32_t x20 = ((uint32_t)(arg1[12]) << 19);
-  uint32_t x21 = ((uint32_t)(arg1[11]) << 11);
-  uint32_t x22 = ((uint32_t)(arg1[10]) << 3);
-  uint32_t x23 = ((uint32_t)(arg1[9]) << 21);
-  uint32_t x24 = ((uint32_t)(arg1[8]) << 13);
-  uint32_t x25 = ((uint32_t)(arg1[7]) << 5);
-  uint32_t x26 = ((uint32_t)(arg1[6]) << 22);
-  uint32_t x27 = ((uint32_t)(arg1[5]) << 14);
-  uint32_t x28 = ((uint32_t)(arg1[4]) << 6);
-  uint32_t x29 = ((uint32_t)(arg1[3]) << 24);
-  uint32_t x30 = ((uint32_t)(arg1[2]) << 16);
-  uint32_t x31 = ((uint32_t)(arg1[1]) << 8);
-  uint8_t x32 = (arg1[0]);
-  uint32_t x33 = (x32 + (x31 + (x30 + x29)));
-  uint8_t x34 = (uint8_t)(x33 >> 26);
-  uint32_t x35 = (x33 & UINT32_C(0x3ffffff));
-  uint32_t x36 = (x3 + (x2 + x1));
-  uint32_t x37 = (x6 + (x5 + x4));
-  uint32_t x38 = (x9 + (x8 + x7));
-  uint32_t x39 = (x12 + (x11 + x10));
-  uint32_t x40 = (x16 + (x15 + (x14 + x13)));
-  uint32_t x41 = (x19 + (x18 + x17));
-  uint32_t x42 = (x22 + (x21 + x20));
-  uint32_t x43 = (x25 + (x24 + x23));
-  uint32_t x44 = (x28 + (x27 + x26));
-  uint32_t x45 = (x34 + x44);
-  uint8_t x46 = (uint8_t)(x45 >> 25);
-  uint32_t x47 = (x45 & UINT32_C(0x1ffffff));
-  uint32_t x48 = (x46 + x43);
-  uint8_t x49 = (uint8_t)(x48 >> 26);
-  uint32_t x50 = (x48 & UINT32_C(0x3ffffff));
-  uint32_t x51 = (x49 + x42);
-  uint8_t x52 = (uint8_t)(x51 >> 25);
-  uint32_t x53 = (x51 & UINT32_C(0x1ffffff));
-  uint32_t x54 = (x52 + x41);
-  uint32_t x55 = (x54 & UINT32_C(0x3ffffff));
-  uint8_t x56 = (uint8_t)(x40 >> 25);
-  uint32_t x57 = (x40 & UINT32_C(0x1ffffff));
-  uint32_t x58 = (x56 + x39);
-  uint8_t x59 = (uint8_t)(x58 >> 26);
-  uint32_t x60 = (x58 & UINT32_C(0x3ffffff));
-  uint32_t x61 = (x59 + x38);
-  uint8_t x62 = (uint8_t)(x61 >> 25);
-  uint32_t x63 = (x61 & UINT32_C(0x1ffffff));
-  uint32_t x64 = (x62 + x37);
-  uint8_t x65 = (uint8_t)(x64 >> 26);
-  uint32_t x66 = (x64 & UINT32_C(0x3ffffff));
-  uint32_t x67 = (x65 + x36);
-  out1[0] = x35;
-  out1[1] = x47;
-  out1[2] = x50;
-  out1[3] = x53;
+static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint32_t x11;
+  uint32_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint8_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint32_t x19;
+  uint32_t x20;
+  uint32_t x21;
+  uint32_t x22;
+  uint32_t x23;
+  uint32_t x24;
+  uint32_t x25;
+  uint32_t x26;
+  uint32_t x27;
+  uint32_t x28;
+  uint32_t x29;
+  uint32_t x30;
+  uint32_t x31;
+  uint8_t x32;
+  uint32_t x33;
+  uint32_t x34;
+  uint32_t x35;
+  uint32_t x36;
+  uint8_t x37;
+  uint32_t x38;
+  uint32_t x39;
+  uint32_t x40;
+  uint32_t x41;
+  uint8_t x42;
+  uint32_t x43;
+  uint32_t x44;
+  uint32_t x45;
+  uint32_t x46;
+  uint8_t x47;
+  uint32_t x48;
+  uint32_t x49;
+  uint32_t x50;
+  uint32_t x51;
+  uint8_t x52;
+  uint32_t x53;
+  uint32_t x54;
+  uint32_t x55;
+  uint32_t x56;
+  uint32_t x57;
+  uint32_t x58;
+  uint32_t x59;
+  uint8_t x60;
+  uint32_t x61;
+  uint32_t x62;
+  uint32_t x63;
+  uint32_t x64;
+  uint8_t x65;
+  uint32_t x66;
+  uint32_t x67;
+  uint32_t x68;
+  uint32_t x69;
+  uint8_t x70;
+  uint32_t x71;
+  uint32_t x72;
+  uint32_t x73;
+  uint32_t x74;
+  uint8_t x75;
+  uint32_t x76;
+  uint32_t x77;
+  uint32_t x78;
+  x1 = ((uint32_t)(arg1[31]) << 18);
+  x2 = ((uint32_t)(arg1[30]) << 10);
+  x3 = ((uint32_t)(arg1[29]) << 2);
+  x4 = ((uint32_t)(arg1[28]) << 20);
+  x5 = ((uint32_t)(arg1[27]) << 12);
+  x6 = ((uint32_t)(arg1[26]) << 4);
+  x7 = ((uint32_t)(arg1[25]) << 21);
+  x8 = ((uint32_t)(arg1[24]) << 13);
+  x9 = ((uint32_t)(arg1[23]) << 5);
+  x10 = ((uint32_t)(arg1[22]) << 23);
+  x11 = ((uint32_t)(arg1[21]) << 15);
+  x12 = ((uint32_t)(arg1[20]) << 7);
+  x13 = ((uint32_t)(arg1[19]) << 24);
+  x14 = ((uint32_t)(arg1[18]) << 16);
+  x15 = ((uint32_t)(arg1[17]) << 8);
+  x16 = (arg1[16]);
+  x17 = ((uint32_t)(arg1[15]) << 18);
+  x18 = ((uint32_t)(arg1[14]) << 10);
+  x19 = ((uint32_t)(arg1[13]) << 2);
+  x20 = ((uint32_t)(arg1[12]) << 19);
+  x21 = ((uint32_t)(arg1[11]) << 11);
+  x22 = ((uint32_t)(arg1[10]) << 3);
+  x23 = ((uint32_t)(arg1[9]) << 21);
+  x24 = ((uint32_t)(arg1[8]) << 13);
+  x25 = ((uint32_t)(arg1[7]) << 5);
+  x26 = ((uint32_t)(arg1[6]) << 22);
+  x27 = ((uint32_t)(arg1[5]) << 14);
+  x28 = ((uint32_t)(arg1[4]) << 6);
+  x29 = ((uint32_t)(arg1[3]) << 24);
+  x30 = ((uint32_t)(arg1[2]) << 16);
+  x31 = ((uint32_t)(arg1[1]) << 8);
+  x32 = (arg1[0]);
+  x33 = (x31 + (uint32_t)x32);
+  x34 = (x30 + x33);
+  x35 = (x29 + x34);
+  x36 = (x35 & UINT32_C(0x3ffffff));
+  x37 = (uint8_t)(x35 >> 26);
+  x38 = (x28 + (uint32_t)x37);
+  x39 = (x27 + x38);
+  x40 = (x26 + x39);
+  x41 = (x40 & UINT32_C(0x1ffffff));
+  x42 = (uint8_t)(x40 >> 25);
+  x43 = (x25 + (uint32_t)x42);
+  x44 = (x24 + x43);
+  x45 = (x23 + x44);
+  x46 = (x45 & UINT32_C(0x3ffffff));
+  x47 = (uint8_t)(x45 >> 26);
+  x48 = (x22 + (uint32_t)x47);
+  x49 = (x21 + x48);
+  x50 = (x20 + x49);
+  x51 = (x50 & UINT32_C(0x1ffffff));
+  x52 = (uint8_t)(x50 >> 25);
+  x53 = (x19 + (uint32_t)x52);
+  x54 = (x18 + x53);
+  x55 = (x17 + x54);
+  x56 = (x15 + (uint32_t)x16);
+  x57 = (x14 + x56);
+  x58 = (x13 + x57);
+  x59 = (x58 & UINT32_C(0x1ffffff));
+  x60 = (uint8_t)(x58 >> 25);
+  x61 = (x12 + (uint32_t)x60);
+  x62 = (x11 + x61);
+  x63 = (x10 + x62);
+  x64 = (x63 & UINT32_C(0x3ffffff));
+  x65 = (uint8_t)(x63 >> 26);
+  x66 = (x9 + (uint32_t)x65);
+  x67 = (x8 + x66);
+  x68 = (x7 + x67);
+  x69 = (x68 & UINT32_C(0x1ffffff));
+  x70 = (uint8_t)(x68 >> 25);
+  x71 = (x6 + (uint32_t)x70);
+  x72 = (x5 + x71);
+  x73 = (x4 + x72);
+  x74 = (x73 & UINT32_C(0x3ffffff));
+  x75 = (uint8_t)(x73 >> 26);
+  x76 = (x3 + (uint32_t)x75);
+  x77 = (x2 + x76);
+  x78 = (x1 + x77);
+  out1[0] = x36;
+  out1[1] = x41;
+  out1[2] = x46;
+  out1[3] = x51;
   out1[4] = x55;
-  out1[5] = x57;
-  out1[6] = x60;
-  out1[7] = x63;
-  out1[8] = x66;
-  out1[9] = x67;
+  out1[5] = x59;
+  out1[6] = x64;
+  out1[7] = x69;
+  out1[8] = x74;
+  out1[9] = x78;
+}
+
+/*
+ * The function fiat_25519_relax is the identity function converting from tight field elements to loose field elements.
+ *
+ * Postconditions:
+ *   out1 = arg1
+ *
+ */
+static FIAT_25519_FIAT_INLINE void fiat_25519_relax(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  x1 = (arg1[0]);
+  x2 = (arg1[1]);
+  x3 = (arg1[2]);
+  x4 = (arg1[3]);
+  x5 = (arg1[4]);
+  x6 = (arg1[5]);
+  x7 = (arg1[6]);
+  x8 = (arg1[7]);
+  x9 = (arg1[8]);
+  x10 = (arg1[9]);
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+  out1[4] = x5;
+  out1[5] = x6;
+  out1[6] = x7;
+  out1[7] = x8;
+  out1[8] = x9;
+  out1[9] = x10;
 }
 
 /*
  * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (121666 * eval arg1) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
  */
-static void fiat_25519_carry_scmul_121666(uint32_t out1[10], const uint32_t arg1[10]) {
-  uint64_t x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9]));
-  uint64_t x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8]));
-  uint64_t x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7]));
-  uint64_t x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6]));
-  uint64_t x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5]));
-  uint64_t x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4]));
-  uint64_t x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3]));
-  uint64_t x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2]));
-  uint64_t x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1]));
-  uint64_t x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0]));
-  uint32_t x11 = (uint32_t)(x10 >> 26);
-  uint32_t x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff));
-  uint64_t x13 = (x11 + x9);
-  uint32_t x14 = (uint32_t)(x13 >> 25);
-  uint32_t x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff));
-  uint64_t x16 = (x14 + x8);
-  uint32_t x17 = (uint32_t)(x16 >> 26);
-  uint32_t x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff));
-  uint64_t x19 = (x17 + x7);
-  uint32_t x20 = (uint32_t)(x19 >> 25);
-  uint32_t x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff));
-  uint64_t x22 = (x20 + x6);
-  uint32_t x23 = (uint32_t)(x22 >> 26);
-  uint32_t x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff));
-  uint64_t x25 = (x23 + x5);
-  uint32_t x26 = (uint32_t)(x25 >> 25);
-  uint32_t x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff));
-  uint64_t x28 = (x26 + x4);
-  uint32_t x29 = (uint32_t)(x28 >> 26);
-  uint32_t x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff));
-  uint64_t x31 = (x29 + x3);
-  uint32_t x32 = (uint32_t)(x31 >> 25);
-  uint32_t x33 = (uint32_t)(x31 & UINT32_C(0x1ffffff));
-  uint64_t x34 = (x32 + x2);
-  uint32_t x35 = (uint32_t)(x34 >> 26);
-  uint32_t x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff));
-  uint64_t x37 = (x35 + x1);
-  uint32_t x38 = (uint32_t)(x37 >> 25);
-  uint32_t x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff));
-  uint32_t x40 = (x38 * UINT8_C(0x13));
-  uint32_t x41 = (x12 + x40);
-  fiat_25519_uint1 x42 = (fiat_25519_uint1)(x41 >> 26);
-  uint32_t x43 = (x41 & UINT32_C(0x3ffffff));
-  uint32_t x44 = (x42 + x15);
-  fiat_25519_uint1 x45 = (fiat_25519_uint1)(x44 >> 25);
-  uint32_t x46 = (x44 & UINT32_C(0x1ffffff));
-  uint32_t x47 = (x45 + x18);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint32_t x11;
+  uint32_t x12;
+  uint64_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint64_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint64_t x19;
+  uint32_t x20;
+  uint32_t x21;
+  uint64_t x22;
+  uint32_t x23;
+  uint32_t x24;
+  uint64_t x25;
+  uint32_t x26;
+  uint32_t x27;
+  uint64_t x28;
+  uint32_t x29;
+  uint32_t x30;
+  uint64_t x31;
+  uint32_t x32;
+  uint32_t x33;
+  uint64_t x34;
+  uint32_t x35;
+  uint32_t x36;
+  uint64_t x37;
+  uint32_t x38;
+  uint32_t x39;
+  uint32_t x40;
+  uint32_t x41;
+  fiat_25519_uint1 x42;
+  uint32_t x43;
+  uint32_t x44;
+  fiat_25519_uint1 x45;
+  uint32_t x46;
+  uint32_t x47;
+  x1 = ((uint64_t)UINT32_C(0x1db42) * (arg1[9]));
+  x2 = ((uint64_t)UINT32_C(0x1db42) * (arg1[8]));
+  x3 = ((uint64_t)UINT32_C(0x1db42) * (arg1[7]));
+  x4 = ((uint64_t)UINT32_C(0x1db42) * (arg1[6]));
+  x5 = ((uint64_t)UINT32_C(0x1db42) * (arg1[5]));
+  x6 = ((uint64_t)UINT32_C(0x1db42) * (arg1[4]));
+  x7 = ((uint64_t)UINT32_C(0x1db42) * (arg1[3]));
+  x8 = ((uint64_t)UINT32_C(0x1db42) * (arg1[2]));
+  x9 = ((uint64_t)UINT32_C(0x1db42) * (arg1[1]));
+  x10 = ((uint64_t)UINT32_C(0x1db42) * (arg1[0]));
+  x11 = (uint32_t)(x10 >> 26);
+  x12 = (uint32_t)(x10 & UINT32_C(0x3ffffff));
+  x13 = (x11 + x9);
+  x14 = (uint32_t)(x13 >> 25);
+  x15 = (uint32_t)(x13 & UINT32_C(0x1ffffff));
+  x16 = (x14 + x8);
+  x17 = (uint32_t)(x16 >> 26);
+  x18 = (uint32_t)(x16 & UINT32_C(0x3ffffff));
+  x19 = (x17 + x7);
+  x20 = (uint32_t)(x19 >> 25);
+  x21 = (uint32_t)(x19 & UINT32_C(0x1ffffff));
+  x22 = (x20 + x6);
+  x23 = (uint32_t)(x22 >> 26);
+  x24 = (uint32_t)(x22 & UINT32_C(0x3ffffff));
+  x25 = (x23 + x5);
+  x26 = (uint32_t)(x25 >> 25);
+  x27 = (uint32_t)(x25 & UINT32_C(0x1ffffff));
+  x28 = (x26 + x4);
+  x29 = (uint32_t)(x28 >> 26);
+  x30 = (uint32_t)(x28 & UINT32_C(0x3ffffff));
+  x31 = (x29 + x3);
+  x32 = (uint32_t)(x31 >> 25);
+  x33 = (uint32_t)(x31 & UINT32_C(0x1ffffff));
+  x34 = (x32 + x2);
+  x35 = (uint32_t)(x34 >> 26);
+  x36 = (uint32_t)(x34 & UINT32_C(0x3ffffff));
+  x37 = (x35 + x1);
+  x38 = (uint32_t)(x37 >> 25);
+  x39 = (uint32_t)(x37 & UINT32_C(0x1ffffff));
+  x40 = (x38 * UINT8_C(0x13));
+  x41 = (x12 + x40);
+  x42 = (fiat_25519_uint1)(x41 >> 26);
+  x43 = (x41 & UINT32_C(0x3ffffff));
+  x44 = (x42 + x15);
+  x45 = (fiat_25519_uint1)(x44 >> 25);
+  x46 = (x44 & UINT32_C(0x1ffffff));
+  x47 = (x45 + x18);
   out1[0] = x43;
   out1[1] = x46;
   out1[2] = x47;
@@ -978,4 +1563,3 @@
   out1[8] = x36;
   out1[9] = x39;
 }
-
diff --git a/third_party/fiat/curve25519_64.h b/third_party/fiat/curve25519_64.h
index 02679bb..faed049 100644
--- a/third_party/fiat/curve25519_64.h
+++ b/third_party/fiat/curve25519_64.h
@@ -1,26 +1,56 @@
-/* Autogenerated: src/ExtractionOCaml/unsaturated_solinas --static 25519 5 '2^255 - 19' 64 carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes carry_scmul121666 */
+/* Autogenerated: 'src/ExtractionOCaml/unsaturated_solinas' --inline --static --use-value-barrier 25519 64 '(auto)' '2^255 - 19' carry_mul carry_square carry add sub opp selectznz to_bytes from_bytes relax carry_scmul121666 */
 /* curve description: 25519 */
-/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, carry_scmul121666 */
-/* n = 5 (from "5") */
-/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */
 /* machine_wordsize = 64 (from "64") */
-
+/* requested operations: carry_mul, carry_square, carry, add, sub, opp, selectznz, to_bytes, from_bytes, relax, carry_scmul121666 */
+/* n = 5 (from "(auto)") */
+/* s-c = 2^255 - [(1, 19)] (from "2^255 - 19") */
+/* tight_bounds_multiplier = 1 (from "") */
+/*  */
 /* Computed values: */
-/* carry_chain = [0, 1, 2, 3, 4, 0, 1] */
+/*   carry_chain = [0, 1, 2, 3, 4, 0, 1] */
+/*   eval z = z[0] + (z[1] << 51) + (z[2] << 102) + (z[3] << 153) + (z[4] << 204) */
+/*   bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/*   balance = [0xfffffffffffda, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe, 0xffffffffffffe] */
 
 #include <stdint.h>
 typedef unsigned char fiat_25519_uint1;
 typedef signed char fiat_25519_int1;
-typedef signed __int128 fiat_25519_int128;
-typedef unsigned __int128 fiat_25519_uint128;
+#if defined(__GNUC__) || defined(__clang__)
+#  define FIAT_25519_FIAT_EXTENSION __extension__
+#  define FIAT_25519_FIAT_INLINE __inline__
+#else
+#  define FIAT_25519_FIAT_EXTENSION
+#  define FIAT_25519_FIAT_INLINE
+#endif
+
+FIAT_25519_FIAT_EXTENSION typedef signed __int128 fiat_25519_int128;
+FIAT_25519_FIAT_EXTENSION typedef unsigned __int128 fiat_25519_uint128;
+
+/* The type fiat_25519_loose_field_element is a field element with loose bounds. */
+/* Bounds: [[0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000], [0x0 ~> 0x18000000000000]] */
+typedef uint64_t fiat_25519_loose_field_element[5];
+
+/* The type fiat_25519_tight_field_element is a field element with tight bounds. */
+/* Bounds: [[0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000], [0x0 ~> 0x8000000000000]] */
+typedef uint64_t fiat_25519_tight_field_element[5];
 
 #if (-1 & 3) != 3
 #error "This code only works on a two's complement system"
 #endif
 
+#if !defined(FIAT_25519_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+static __inline__ uint64_t fiat_25519_value_barrier_u64(uint64_t a) {
+  __asm__("" : "+r"(a) : /* no inputs */);
+  return a;
+}
+#else
+#  define fiat_25519_value_barrier_u64(x) (x)
+#endif
+
 
 /*
  * The function fiat_25519_addcarryx_u51 is an addition with carry.
+ *
  * Postconditions:
  *   out1 = (arg1 + arg2 + arg3) mod 2^51
  *   out2 = ⌊(arg1 + arg2 + arg3) / 2^51⌋
@@ -33,16 +63,20 @@
  *   out1: [0x0 ~> 0x7ffffffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  uint64_t x1 = ((arg1 + arg2) + arg3);
-  uint64_t x2 = (x1 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint1 x3 = (fiat_25519_uint1)(x1 >> 51);
+static FIAT_25519_FIAT_INLINE void fiat_25519_addcarryx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  uint64_t x1;
+  uint64_t x2;
+  fiat_25519_uint1 x3;
+  x1 = ((arg1 + arg2) + arg3);
+  x2 = (x1 & UINT64_C(0x7ffffffffffff));
+  x3 = (fiat_25519_uint1)(x1 >> 51);
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_25519_subborrowx_u51 is a subtraction with borrow.
+ *
  * Postconditions:
  *   out1 = (-arg1 + arg2 + -arg3) mod 2^51
  *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^51⌋
@@ -55,16 +89,20 @@
  *   out1: [0x0 ~> 0x7ffffffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  int64_t x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3);
-  fiat_25519_int1 x2 = (fiat_25519_int1)(x1 >> 51);
-  uint64_t x3 = (x1 & UINT64_C(0x7ffffffffffff));
+static FIAT_25519_FIAT_INLINE void fiat_25519_subborrowx_u51(uint64_t* out1, fiat_25519_uint1* out2, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  int64_t x1;
+  fiat_25519_int1 x2;
+  uint64_t x3;
+  x1 = ((int64_t)(arg2 - (int64_t)arg1) - (int64_t)arg3);
+  x2 = (fiat_25519_int1)(x1 >> 51);
+  x3 = (x1 & UINT64_C(0x7ffffffffffff));
   *out1 = x3;
   *out2 = (fiat_25519_uint1)(0x0 - x2);
 }
 
 /*
  * The function fiat_25519_cmovznz_u64 is a single-word conditional move.
+ *
  * Postconditions:
  *   out1 = (if arg1 = 0 then arg2 else arg3)
  *
@@ -75,83 +113,128 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffffffffffff]
  */
-static void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  fiat_25519_uint1 x1 = (!(!arg1));
-  uint64_t x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));
-  // Note this line has been patched from the synthesized code to add value
-  // barriers.
-  //
-  // Clang recognizes this pattern as a select. While it usually transforms it
-  // to a cmov, it sometimes further transforms it into a branch, which we do
-  // not want.
-  uint64_t x3 = ((value_barrier_u64(x2) & arg3) | (value_barrier_u64(~x2) & arg2));
+static FIAT_25519_FIAT_INLINE void fiat_25519_cmovznz_u64(uint64_t* out1, fiat_25519_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  fiat_25519_uint1 x1;
+  uint64_t x2;
+  uint64_t x3;
+  x1 = (!(!arg1));
+  x2 = ((fiat_25519_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));
+  x3 = ((fiat_25519_value_barrier_u64(x2) & arg3) | (fiat_25519_value_barrier_u64((~x2)) & arg2));
   *out1 = x3;
 }
 
 /*
  * The function fiat_25519_carry_mul multiplies two field elements and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
- *   arg2: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  */
-static void fiat_25519_carry_mul(uint64_t out1[5], const uint64_t arg1[5], const uint64_t arg2[5]) {
-  fiat_25519_uint128 x1 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[4]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x2 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[3]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x3 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[2]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x4 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[1]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x5 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[4]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x6 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[3]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x7 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[2]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x8 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[4]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x9 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[3]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x10 = ((fiat_25519_uint128)(arg1[1]) * ((arg2[4]) * UINT8_C(0x13)));
-  fiat_25519_uint128 x11 = ((fiat_25519_uint128)(arg1[4]) * (arg2[0]));
-  fiat_25519_uint128 x12 = ((fiat_25519_uint128)(arg1[3]) * (arg2[1]));
-  fiat_25519_uint128 x13 = ((fiat_25519_uint128)(arg1[3]) * (arg2[0]));
-  fiat_25519_uint128 x14 = ((fiat_25519_uint128)(arg1[2]) * (arg2[2]));
-  fiat_25519_uint128 x15 = ((fiat_25519_uint128)(arg1[2]) * (arg2[1]));
-  fiat_25519_uint128 x16 = ((fiat_25519_uint128)(arg1[2]) * (arg2[0]));
-  fiat_25519_uint128 x17 = ((fiat_25519_uint128)(arg1[1]) * (arg2[3]));
-  fiat_25519_uint128 x18 = ((fiat_25519_uint128)(arg1[1]) * (arg2[2]));
-  fiat_25519_uint128 x19 = ((fiat_25519_uint128)(arg1[1]) * (arg2[1]));
-  fiat_25519_uint128 x20 = ((fiat_25519_uint128)(arg1[1]) * (arg2[0]));
-  fiat_25519_uint128 x21 = ((fiat_25519_uint128)(arg1[0]) * (arg2[4]));
-  fiat_25519_uint128 x22 = ((fiat_25519_uint128)(arg1[0]) * (arg2[3]));
-  fiat_25519_uint128 x23 = ((fiat_25519_uint128)(arg1[0]) * (arg2[2]));
-  fiat_25519_uint128 x24 = ((fiat_25519_uint128)(arg1[0]) * (arg2[1]));
-  fiat_25519_uint128 x25 = ((fiat_25519_uint128)(arg1[0]) * (arg2[0]));
-  fiat_25519_uint128 x26 = (x25 + (x10 + (x9 + (x7 + x4))));
-  uint64_t x27 = (uint64_t)(x26 >> 51);
-  uint64_t x28 = (uint64_t)(x26 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x29 = (x21 + (x17 + (x14 + (x12 + x11))));
-  fiat_25519_uint128 x30 = (x22 + (x18 + (x15 + (x13 + x1))));
-  fiat_25519_uint128 x31 = (x23 + (x19 + (x16 + (x5 + x2))));
-  fiat_25519_uint128 x32 = (x24 + (x20 + (x8 + (x6 + x3))));
-  fiat_25519_uint128 x33 = (x27 + x32);
-  uint64_t x34 = (uint64_t)(x33 >> 51);
-  uint64_t x35 = (uint64_t)(x33 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x36 = (x34 + x31);
-  uint64_t x37 = (uint64_t)(x36 >> 51);
-  uint64_t x38 = (uint64_t)(x36 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x39 = (x37 + x30);
-  uint64_t x40 = (uint64_t)(x39 >> 51);
-  uint64_t x41 = (uint64_t)(x39 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x42 = (x40 + x29);
-  uint64_t x43 = (uint64_t)(x42 >> 51);
-  uint64_t x44 = (uint64_t)(x42 & UINT64_C(0x7ffffffffffff));
-  uint64_t x45 = (x43 * UINT8_C(0x13));
-  uint64_t x46 = (x28 + x45);
-  uint64_t x47 = (x46 >> 51);
-  uint64_t x48 = (x46 & UINT64_C(0x7ffffffffffff));
-  uint64_t x49 = (x47 + x35);
-  fiat_25519_uint1 x50 = (fiat_25519_uint1)(x49 >> 51);
-  uint64_t x51 = (x49 & UINT64_C(0x7ffffffffffff));
-  uint64_t x52 = (x50 + x38);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_mul(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1, const fiat_25519_loose_field_element arg2) {
+  fiat_25519_uint128 x1;
+  fiat_25519_uint128 x2;
+  fiat_25519_uint128 x3;
+  fiat_25519_uint128 x4;
+  fiat_25519_uint128 x5;
+  fiat_25519_uint128 x6;
+  fiat_25519_uint128 x7;
+  fiat_25519_uint128 x8;
+  fiat_25519_uint128 x9;
+  fiat_25519_uint128 x10;
+  fiat_25519_uint128 x11;
+  fiat_25519_uint128 x12;
+  fiat_25519_uint128 x13;
+  fiat_25519_uint128 x14;
+  fiat_25519_uint128 x15;
+  fiat_25519_uint128 x16;
+  fiat_25519_uint128 x17;
+  fiat_25519_uint128 x18;
+  fiat_25519_uint128 x19;
+  fiat_25519_uint128 x20;
+  fiat_25519_uint128 x21;
+  fiat_25519_uint128 x22;
+  fiat_25519_uint128 x23;
+  fiat_25519_uint128 x24;
+  fiat_25519_uint128 x25;
+  fiat_25519_uint128 x26;
+  uint64_t x27;
+  uint64_t x28;
+  fiat_25519_uint128 x29;
+  fiat_25519_uint128 x30;
+  fiat_25519_uint128 x31;
+  fiat_25519_uint128 x32;
+  fiat_25519_uint128 x33;
+  uint64_t x34;
+  uint64_t x35;
+  fiat_25519_uint128 x36;
+  uint64_t x37;
+  uint64_t x38;
+  fiat_25519_uint128 x39;
+  uint64_t x40;
+  uint64_t x41;
+  fiat_25519_uint128 x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  uint64_t x48;
+  uint64_t x49;
+  fiat_25519_uint1 x50;
+  uint64_t x51;
+  uint64_t x52;
+  x1 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[4]) * UINT8_C(0x13)));
+  x2 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[3]) * UINT8_C(0x13)));
+  x3 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[2]) * UINT8_C(0x13)));
+  x4 = ((fiat_25519_uint128)(arg1[4]) * ((arg2[1]) * UINT8_C(0x13)));
+  x5 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[4]) * UINT8_C(0x13)));
+  x6 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[3]) * UINT8_C(0x13)));
+  x7 = ((fiat_25519_uint128)(arg1[3]) * ((arg2[2]) * UINT8_C(0x13)));
+  x8 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[4]) * UINT8_C(0x13)));
+  x9 = ((fiat_25519_uint128)(arg1[2]) * ((arg2[3]) * UINT8_C(0x13)));
+  x10 = ((fiat_25519_uint128)(arg1[1]) * ((arg2[4]) * UINT8_C(0x13)));
+  x11 = ((fiat_25519_uint128)(arg1[4]) * (arg2[0]));
+  x12 = ((fiat_25519_uint128)(arg1[3]) * (arg2[1]));
+  x13 = ((fiat_25519_uint128)(arg1[3]) * (arg2[0]));
+  x14 = ((fiat_25519_uint128)(arg1[2]) * (arg2[2]));
+  x15 = ((fiat_25519_uint128)(arg1[2]) * (arg2[1]));
+  x16 = ((fiat_25519_uint128)(arg1[2]) * (arg2[0]));
+  x17 = ((fiat_25519_uint128)(arg1[1]) * (arg2[3]));
+  x18 = ((fiat_25519_uint128)(arg1[1]) * (arg2[2]));
+  x19 = ((fiat_25519_uint128)(arg1[1]) * (arg2[1]));
+  x20 = ((fiat_25519_uint128)(arg1[1]) * (arg2[0]));
+  x21 = ((fiat_25519_uint128)(arg1[0]) * (arg2[4]));
+  x22 = ((fiat_25519_uint128)(arg1[0]) * (arg2[3]));
+  x23 = ((fiat_25519_uint128)(arg1[0]) * (arg2[2]));
+  x24 = ((fiat_25519_uint128)(arg1[0]) * (arg2[1]));
+  x25 = ((fiat_25519_uint128)(arg1[0]) * (arg2[0]));
+  x26 = (x25 + (x10 + (x9 + (x7 + x4))));
+  x27 = (uint64_t)(x26 >> 51);
+  x28 = (uint64_t)(x26 & UINT64_C(0x7ffffffffffff));
+  x29 = (x21 + (x17 + (x14 + (x12 + x11))));
+  x30 = (x22 + (x18 + (x15 + (x13 + x1))));
+  x31 = (x23 + (x19 + (x16 + (x5 + x2))));
+  x32 = (x24 + (x20 + (x8 + (x6 + x3))));
+  x33 = (x27 + x32);
+  x34 = (uint64_t)(x33 >> 51);
+  x35 = (uint64_t)(x33 & UINT64_C(0x7ffffffffffff));
+  x36 = (x34 + x31);
+  x37 = (uint64_t)(x36 >> 51);
+  x38 = (uint64_t)(x36 & UINT64_C(0x7ffffffffffff));
+  x39 = (x37 + x30);
+  x40 = (uint64_t)(x39 >> 51);
+  x41 = (uint64_t)(x39 & UINT64_C(0x7ffffffffffff));
+  x42 = (x40 + x29);
+  x43 = (uint64_t)(x42 >> 51);
+  x44 = (uint64_t)(x42 & UINT64_C(0x7ffffffffffff));
+  x45 = (x43 * UINT8_C(0x13));
+  x46 = (x28 + x45);
+  x47 = (x46 >> 51);
+  x48 = (x46 & UINT64_C(0x7ffffffffffff));
+  x49 = (x47 + x35);
+  x50 = (fiat_25519_uint1)(x49 >> 51);
+  x51 = (x49 & UINT64_C(0x7ffffffffffff));
+  x52 = (x50 + x38);
   out1[0] = x48;
   out1[1] = x51;
   out1[2] = x52;
@@ -161,65 +244,112 @@
 
 /*
  * The function fiat_25519_carry_square squares a field element and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * eval arg1) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  */
-static void fiat_25519_carry_square(uint64_t out1[5], const uint64_t arg1[5]) {
-  uint64_t x1 = ((arg1[4]) * UINT8_C(0x13));
-  uint64_t x2 = (x1 * 0x2);
-  uint64_t x3 = ((arg1[4]) * 0x2);
-  uint64_t x4 = ((arg1[3]) * UINT8_C(0x13));
-  uint64_t x5 = (x4 * 0x2);
-  uint64_t x6 = ((arg1[3]) * 0x2);
-  uint64_t x7 = ((arg1[2]) * 0x2);
-  uint64_t x8 = ((arg1[1]) * 0x2);
-  fiat_25519_uint128 x9 = ((fiat_25519_uint128)(arg1[4]) * x1);
-  fiat_25519_uint128 x10 = ((fiat_25519_uint128)(arg1[3]) * x2);
-  fiat_25519_uint128 x11 = ((fiat_25519_uint128)(arg1[3]) * x4);
-  fiat_25519_uint128 x12 = ((fiat_25519_uint128)(arg1[2]) * x2);
-  fiat_25519_uint128 x13 = ((fiat_25519_uint128)(arg1[2]) * x5);
-  fiat_25519_uint128 x14 = ((fiat_25519_uint128)(arg1[2]) * (arg1[2]));
-  fiat_25519_uint128 x15 = ((fiat_25519_uint128)(arg1[1]) * x2);
-  fiat_25519_uint128 x16 = ((fiat_25519_uint128)(arg1[1]) * x6);
-  fiat_25519_uint128 x17 = ((fiat_25519_uint128)(arg1[1]) * x7);
-  fiat_25519_uint128 x18 = ((fiat_25519_uint128)(arg1[1]) * (arg1[1]));
-  fiat_25519_uint128 x19 = ((fiat_25519_uint128)(arg1[0]) * x3);
-  fiat_25519_uint128 x20 = ((fiat_25519_uint128)(arg1[0]) * x6);
-  fiat_25519_uint128 x21 = ((fiat_25519_uint128)(arg1[0]) * x7);
-  fiat_25519_uint128 x22 = ((fiat_25519_uint128)(arg1[0]) * x8);
-  fiat_25519_uint128 x23 = ((fiat_25519_uint128)(arg1[0]) * (arg1[0]));
-  fiat_25519_uint128 x24 = (x23 + (x15 + x13));
-  uint64_t x25 = (uint64_t)(x24 >> 51);
-  uint64_t x26 = (uint64_t)(x24 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x27 = (x19 + (x16 + x14));
-  fiat_25519_uint128 x28 = (x20 + (x17 + x9));
-  fiat_25519_uint128 x29 = (x21 + (x18 + x10));
-  fiat_25519_uint128 x30 = (x22 + (x12 + x11));
-  fiat_25519_uint128 x31 = (x25 + x30);
-  uint64_t x32 = (uint64_t)(x31 >> 51);
-  uint64_t x33 = (uint64_t)(x31 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x34 = (x32 + x29);
-  uint64_t x35 = (uint64_t)(x34 >> 51);
-  uint64_t x36 = (uint64_t)(x34 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x37 = (x35 + x28);
-  uint64_t x38 = (uint64_t)(x37 >> 51);
-  uint64_t x39 = (uint64_t)(x37 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x40 = (x38 + x27);
-  uint64_t x41 = (uint64_t)(x40 >> 51);
-  uint64_t x42 = (uint64_t)(x40 & UINT64_C(0x7ffffffffffff));
-  uint64_t x43 = (x41 * UINT8_C(0x13));
-  uint64_t x44 = (x26 + x43);
-  uint64_t x45 = (x44 >> 51);
-  uint64_t x46 = (x44 & UINT64_C(0x7ffffffffffff));
-  uint64_t x47 = (x45 + x33);
-  fiat_25519_uint1 x48 = (fiat_25519_uint1)(x47 >> 51);
-  uint64_t x49 = (x47 & UINT64_C(0x7ffffffffffff));
-  uint64_t x50 = (x48 + x36);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_square(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  fiat_25519_uint128 x9;
+  fiat_25519_uint128 x10;
+  fiat_25519_uint128 x11;
+  fiat_25519_uint128 x12;
+  fiat_25519_uint128 x13;
+  fiat_25519_uint128 x14;
+  fiat_25519_uint128 x15;
+  fiat_25519_uint128 x16;
+  fiat_25519_uint128 x17;
+  fiat_25519_uint128 x18;
+  fiat_25519_uint128 x19;
+  fiat_25519_uint128 x20;
+  fiat_25519_uint128 x21;
+  fiat_25519_uint128 x22;
+  fiat_25519_uint128 x23;
+  fiat_25519_uint128 x24;
+  uint64_t x25;
+  uint64_t x26;
+  fiat_25519_uint128 x27;
+  fiat_25519_uint128 x28;
+  fiat_25519_uint128 x29;
+  fiat_25519_uint128 x30;
+  fiat_25519_uint128 x31;
+  uint64_t x32;
+  uint64_t x33;
+  fiat_25519_uint128 x34;
+  uint64_t x35;
+  uint64_t x36;
+  fiat_25519_uint128 x37;
+  uint64_t x38;
+  uint64_t x39;
+  fiat_25519_uint128 x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  fiat_25519_uint1 x48;
+  uint64_t x49;
+  uint64_t x50;
+  x1 = ((arg1[4]) * UINT8_C(0x13));
+  x2 = (x1 * 0x2);
+  x3 = ((arg1[4]) * 0x2);
+  x4 = ((arg1[3]) * UINT8_C(0x13));
+  x5 = (x4 * 0x2);
+  x6 = ((arg1[3]) * 0x2);
+  x7 = ((arg1[2]) * 0x2);
+  x8 = ((arg1[1]) * 0x2);
+  x9 = ((fiat_25519_uint128)(arg1[4]) * x1);
+  x10 = ((fiat_25519_uint128)(arg1[3]) * x2);
+  x11 = ((fiat_25519_uint128)(arg1[3]) * x4);
+  x12 = ((fiat_25519_uint128)(arg1[2]) * x2);
+  x13 = ((fiat_25519_uint128)(arg1[2]) * x5);
+  x14 = ((fiat_25519_uint128)(arg1[2]) * (arg1[2]));
+  x15 = ((fiat_25519_uint128)(arg1[1]) * x2);
+  x16 = ((fiat_25519_uint128)(arg1[1]) * x6);
+  x17 = ((fiat_25519_uint128)(arg1[1]) * x7);
+  x18 = ((fiat_25519_uint128)(arg1[1]) * (arg1[1]));
+  x19 = ((fiat_25519_uint128)(arg1[0]) * x3);
+  x20 = ((fiat_25519_uint128)(arg1[0]) * x6);
+  x21 = ((fiat_25519_uint128)(arg1[0]) * x7);
+  x22 = ((fiat_25519_uint128)(arg1[0]) * x8);
+  x23 = ((fiat_25519_uint128)(arg1[0]) * (arg1[0]));
+  x24 = (x23 + (x15 + x13));
+  x25 = (uint64_t)(x24 >> 51);
+  x26 = (uint64_t)(x24 & UINT64_C(0x7ffffffffffff));
+  x27 = (x19 + (x16 + x14));
+  x28 = (x20 + (x17 + x9));
+  x29 = (x21 + (x18 + x10));
+  x30 = (x22 + (x12 + x11));
+  x31 = (x25 + x30);
+  x32 = (uint64_t)(x31 >> 51);
+  x33 = (uint64_t)(x31 & UINT64_C(0x7ffffffffffff));
+  x34 = (x32 + x29);
+  x35 = (uint64_t)(x34 >> 51);
+  x36 = (uint64_t)(x34 & UINT64_C(0x7ffffffffffff));
+  x37 = (x35 + x28);
+  x38 = (uint64_t)(x37 >> 51);
+  x39 = (uint64_t)(x37 & UINT64_C(0x7ffffffffffff));
+  x40 = (x38 + x27);
+  x41 = (uint64_t)(x40 >> 51);
+  x42 = (uint64_t)(x40 & UINT64_C(0x7ffffffffffff));
+  x43 = (x41 * UINT8_C(0x13));
+  x44 = (x26 + x43);
+  x45 = (x44 >> 51);
+  x46 = (x44 & UINT64_C(0x7ffffffffffff));
+  x47 = (x45 + x33);
+  x48 = (fiat_25519_uint1)(x47 >> 51);
+  x49 = (x47 & UINT64_C(0x7ffffffffffff));
+  x50 = (x48 + x36);
   out1[0] = x46;
   out1[1] = x49;
   out1[2] = x50;
@@ -229,27 +359,36 @@
 
 /*
  * The function fiat_25519_carry reduces a field element.
+ *
  * Postconditions:
  *   eval out1 mod m = eval arg1 mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  */
-static void fiat_25519_carry(uint64_t out1[5], const uint64_t arg1[5]) {
-  uint64_t x1 = (arg1[0]);
-  uint64_t x2 = ((x1 >> 51) + (arg1[1]));
-  uint64_t x3 = ((x2 >> 51) + (arg1[2]));
-  uint64_t x4 = ((x3 >> 51) + (arg1[3]));
-  uint64_t x5 = ((x4 >> 51) + (arg1[4]));
-  uint64_t x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13)));
-  uint64_t x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff)));
-  uint64_t x8 = (x6 & UINT64_C(0x7ffffffffffff));
-  uint64_t x9 = (x7 & UINT64_C(0x7ffffffffffff));
-  uint64_t x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff)));
-  uint64_t x11 = (x4 & UINT64_C(0x7ffffffffffff));
-  uint64_t x12 = (x5 & UINT64_C(0x7ffffffffffff));
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  x1 = (arg1[0]);
+  x2 = ((x1 >> 51) + (arg1[1]));
+  x3 = ((x2 >> 51) + (arg1[2]));
+  x4 = ((x3 >> 51) + (arg1[3]));
+  x5 = ((x4 >> 51) + (arg1[4]));
+  x6 = ((x1 & UINT64_C(0x7ffffffffffff)) + ((x5 >> 51) * UINT8_C(0x13)));
+  x7 = ((fiat_25519_uint1)(x6 >> 51) + (x2 & UINT64_C(0x7ffffffffffff)));
+  x8 = (x6 & UINT64_C(0x7ffffffffffff));
+  x9 = (x7 & UINT64_C(0x7ffffffffffff));
+  x10 = ((fiat_25519_uint1)(x7 >> 51) + (x3 & UINT64_C(0x7ffffffffffff)));
+  x11 = (x4 & UINT64_C(0x7ffffffffffff));
+  x12 = (x5 & UINT64_C(0x7ffffffffffff));
   out1[0] = x8;
   out1[1] = x9;
   out1[2] = x10;
@@ -259,21 +398,22 @@
 
 /*
  * The function fiat_25519_add adds two field elements.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 + eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
- *   arg2: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
  */
-static void fiat_25519_add(uint64_t out1[5], const uint64_t arg1[5], const uint64_t arg2[5]) {
-  uint64_t x1 = ((arg1[0]) + (arg2[0]));
-  uint64_t x2 = ((arg1[1]) + (arg2[1]));
-  uint64_t x3 = ((arg1[2]) + (arg2[2]));
-  uint64_t x4 = ((arg1[3]) + (arg2[3]));
-  uint64_t x5 = ((arg1[4]) + (arg2[4]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_add(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  x1 = ((arg1[0]) + (arg2[0]));
+  x2 = ((arg1[1]) + (arg2[1]));
+  x3 = ((arg1[2]) + (arg2[2]));
+  x4 = ((arg1[3]) + (arg2[3]));
+  x5 = ((arg1[4]) + (arg2[4]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -283,21 +423,22 @@
 
 /*
  * The function fiat_25519_sub subtracts two field elements.
+ *
  * Postconditions:
  *   eval out1 mod m = (eval arg1 - eval arg2) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
- *   arg2: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
  */
-static void fiat_25519_sub(uint64_t out1[5], const uint64_t arg1[5], const uint64_t arg2[5]) {
-  uint64_t x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0]));
-  uint64_t x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1]));
-  uint64_t x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2]));
-  uint64_t x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3]));
-  uint64_t x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_sub(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1, const fiat_25519_tight_field_element arg2) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  x1 = ((UINT64_C(0xfffffffffffda) + (arg1[0])) - (arg2[0]));
+  x2 = ((UINT64_C(0xffffffffffffe) + (arg1[1])) - (arg2[1]));
+  x3 = ((UINT64_C(0xffffffffffffe) + (arg1[2])) - (arg2[2]));
+  x4 = ((UINT64_C(0xffffffffffffe) + (arg1[3])) - (arg2[3]));
+  x5 = ((UINT64_C(0xffffffffffffe) + (arg1[4])) - (arg2[4]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -307,20 +448,22 @@
 
 /*
  * The function fiat_25519_opp negates a field element.
+ *
  * Postconditions:
  *   eval out1 mod m = -eval arg1 mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
  */
-static void fiat_25519_opp(uint64_t out1[5], const uint64_t arg1[5]) {
-  uint64_t x1 = (UINT64_C(0xfffffffffffda) - (arg1[0]));
-  uint64_t x2 = (UINT64_C(0xffffffffffffe) - (arg1[1]));
-  uint64_t x3 = (UINT64_C(0xffffffffffffe) - (arg1[2]));
-  uint64_t x4 = (UINT64_C(0xffffffffffffe) - (arg1[3]));
-  uint64_t x5 = (UINT64_C(0xffffffffffffe) - (arg1[4]));
+static FIAT_25519_FIAT_INLINE void fiat_25519_opp(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  x1 = (UINT64_C(0xfffffffffffda) - (arg1[0]));
+  x2 = (UINT64_C(0xffffffffffffe) - (arg1[1]));
+  x3 = (UINT64_C(0xffffffffffffe) - (arg1[2]));
+  x4 = (UINT64_C(0xffffffffffffe) - (arg1[3]));
+  x5 = (UINT64_C(0xffffffffffffe) - (arg1[4]));
   out1[0] = x1;
   out1[1] = x2;
   out1[2] = x3;
@@ -330,6 +473,7 @@
 
 /*
  * The function fiat_25519_selectznz is a multi-limb conditional select.
+ *
  * Postconditions:
  *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
  *
@@ -340,16 +484,16 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_25519_selectznz(uint64_t out1[5], fiat_25519_uint1 arg1, const uint64_t arg2[5], const uint64_t arg3[5]) {
+static FIAT_25519_FIAT_INLINE void fiat_25519_selectznz(uint64_t out1[5], fiat_25519_uint1 arg1, const uint64_t arg2[5], const uint64_t arg3[5]) {
   uint64_t x1;
-  fiat_25519_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
   uint64_t x2;
-  fiat_25519_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
   uint64_t x3;
-  fiat_25519_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
   uint64_t x4;
-  fiat_25519_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
   uint64_t x5;
+  fiat_25519_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_25519_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_25519_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_25519_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
   fiat_25519_cmovznz_u64(&x5, arg1, (arg2[4]), (arg3[4]));
   out1[0] = x1;
   out1[1] = x2;
@@ -360,260 +504,469 @@
 
 /*
  * The function fiat_25519_to_bytes serializes a field element to bytes in little-endian order.
+ *
  * Postconditions:
  *   out1 = map (λ x, ⌊((eval arg1 mod m) mod 2^(8 * (x + 1))) / 2^(8 * x)⌋) [0..31]
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  * Output Bounds:
  *   out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
  */
-static void fiat_25519_to_bytes(uint8_t out1[32], const uint64_t arg1[5]) {
+static FIAT_25519_FIAT_INLINE void fiat_25519_to_bytes(uint8_t out1[32], const fiat_25519_tight_field_element arg1) {
   uint64_t x1;
   fiat_25519_uint1 x2;
-  fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed));
   uint64_t x3;
   fiat_25519_uint1 x4;
-  fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff));
   uint64_t x5;
   fiat_25519_uint1 x6;
-  fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff));
   uint64_t x7;
   fiat_25519_uint1 x8;
-  fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff));
   uint64_t x9;
   fiat_25519_uint1 x10;
-  fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff));
   uint64_t x11;
-  fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff));
   uint64_t x12;
   fiat_25519_uint1 x13;
-  fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed)));
   uint64_t x14;
   fiat_25519_uint1 x15;
-  fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff)));
   uint64_t x16;
   fiat_25519_uint1 x17;
-  fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff)));
   uint64_t x18;
   fiat_25519_uint1 x19;
-  fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff)));
   uint64_t x20;
   fiat_25519_uint1 x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  uint8_t x26;
+  uint64_t x27;
+  uint8_t x28;
+  uint64_t x29;
+  uint8_t x30;
+  uint64_t x31;
+  uint8_t x32;
+  uint64_t x33;
+  uint8_t x34;
+  uint64_t x35;
+  uint8_t x36;
+  uint8_t x37;
+  uint64_t x38;
+  uint8_t x39;
+  uint64_t x40;
+  uint8_t x41;
+  uint64_t x42;
+  uint8_t x43;
+  uint64_t x44;
+  uint8_t x45;
+  uint64_t x46;
+  uint8_t x47;
+  uint64_t x48;
+  uint8_t x49;
+  uint8_t x50;
+  uint64_t x51;
+  uint8_t x52;
+  uint64_t x53;
+  uint8_t x54;
+  uint64_t x55;
+  uint8_t x56;
+  uint64_t x57;
+  uint8_t x58;
+  uint64_t x59;
+  uint8_t x60;
+  uint64_t x61;
+  uint8_t x62;
+  uint64_t x63;
+  uint8_t x64;
+  fiat_25519_uint1 x65;
+  uint64_t x66;
+  uint8_t x67;
+  uint64_t x68;
+  uint8_t x69;
+  uint64_t x70;
+  uint8_t x71;
+  uint64_t x72;
+  uint8_t x73;
+  uint64_t x74;
+  uint8_t x75;
+  uint64_t x76;
+  uint8_t x77;
+  uint8_t x78;
+  uint64_t x79;
+  uint8_t x80;
+  uint64_t x81;
+  uint8_t x82;
+  uint64_t x83;
+  uint8_t x84;
+  uint64_t x85;
+  uint8_t x86;
+  uint64_t x87;
+  uint8_t x88;
+  uint64_t x89;
+  uint8_t x90;
+  uint8_t x91;
+  fiat_25519_subborrowx_u51(&x1, &x2, 0x0, (arg1[0]), UINT64_C(0x7ffffffffffed));
+  fiat_25519_subborrowx_u51(&x3, &x4, x2, (arg1[1]), UINT64_C(0x7ffffffffffff));
+  fiat_25519_subborrowx_u51(&x5, &x6, x4, (arg1[2]), UINT64_C(0x7ffffffffffff));
+  fiat_25519_subborrowx_u51(&x7, &x8, x6, (arg1[3]), UINT64_C(0x7ffffffffffff));
+  fiat_25519_subborrowx_u51(&x9, &x10, x8, (arg1[4]), UINT64_C(0x7ffffffffffff));
+  fiat_25519_cmovznz_u64(&x11, x10, 0x0, UINT64_C(0xffffffffffffffff));
+  fiat_25519_addcarryx_u51(&x12, &x13, 0x0, x1, (x11 & UINT64_C(0x7ffffffffffed)));
+  fiat_25519_addcarryx_u51(&x14, &x15, x13, x3, (x11 & UINT64_C(0x7ffffffffffff)));
+  fiat_25519_addcarryx_u51(&x16, &x17, x15, x5, (x11 & UINT64_C(0x7ffffffffffff)));
+  fiat_25519_addcarryx_u51(&x18, &x19, x17, x7, (x11 & UINT64_C(0x7ffffffffffff)));
   fiat_25519_addcarryx_u51(&x20, &x21, x19, x9, (x11 & UINT64_C(0x7ffffffffffff)));
-  uint64_t x22 = (x20 << 4);
-  uint64_t x23 = (x18 * (uint64_t)0x2);
-  uint64_t x24 = (x16 << 6);
-  uint64_t x25 = (x14 << 3);
-  uint64_t x26 = (x12 >> 8);
-  uint8_t x27 = (uint8_t)(x12 & UINT8_C(0xff));
-  uint64_t x28 = (x26 >> 8);
-  uint8_t x29 = (uint8_t)(x26 & UINT8_C(0xff));
-  uint64_t x30 = (x28 >> 8);
-  uint8_t x31 = (uint8_t)(x28 & UINT8_C(0xff));
-  uint64_t x32 = (x30 >> 8);
-  uint8_t x33 = (uint8_t)(x30 & UINT8_C(0xff));
-  uint64_t x34 = (x32 >> 8);
-  uint8_t x35 = (uint8_t)(x32 & UINT8_C(0xff));
-  uint8_t x36 = (uint8_t)(x34 >> 8);
-  uint8_t x37 = (uint8_t)(x34 & UINT8_C(0xff));
-  uint64_t x38 = (x36 + x25);
-  uint64_t x39 = (x38 >> 8);
-  uint8_t x40 = (uint8_t)(x38 & UINT8_C(0xff));
-  uint64_t x41 = (x39 >> 8);
-  uint8_t x42 = (uint8_t)(x39 & UINT8_C(0xff));
-  uint64_t x43 = (x41 >> 8);
-  uint8_t x44 = (uint8_t)(x41 & UINT8_C(0xff));
-  uint64_t x45 = (x43 >> 8);
-  uint8_t x46 = (uint8_t)(x43 & UINT8_C(0xff));
-  uint64_t x47 = (x45 >> 8);
-  uint8_t x48 = (uint8_t)(x45 & UINT8_C(0xff));
-  uint8_t x49 = (uint8_t)(x47 >> 8);
-  uint8_t x50 = (uint8_t)(x47 & UINT8_C(0xff));
-  uint64_t x51 = (x49 + x24);
-  uint64_t x52 = (x51 >> 8);
-  uint8_t x53 = (uint8_t)(x51 & UINT8_C(0xff));
-  uint64_t x54 = (x52 >> 8);
-  uint8_t x55 = (uint8_t)(x52 & UINT8_C(0xff));
-  uint64_t x56 = (x54 >> 8);
-  uint8_t x57 = (uint8_t)(x54 & UINT8_C(0xff));
-  uint64_t x58 = (x56 >> 8);
-  uint8_t x59 = (uint8_t)(x56 & UINT8_C(0xff));
-  uint64_t x60 = (x58 >> 8);
-  uint8_t x61 = (uint8_t)(x58 & UINT8_C(0xff));
-  uint64_t x62 = (x60 >> 8);
-  uint8_t x63 = (uint8_t)(x60 & UINT8_C(0xff));
-  fiat_25519_uint1 x64 = (fiat_25519_uint1)(x62 >> 8);
-  uint8_t x65 = (uint8_t)(x62 & UINT8_C(0xff));
-  uint64_t x66 = (x64 + x23);
-  uint64_t x67 = (x66 >> 8);
-  uint8_t x68 = (uint8_t)(x66 & UINT8_C(0xff));
-  uint64_t x69 = (x67 >> 8);
-  uint8_t x70 = (uint8_t)(x67 & UINT8_C(0xff));
-  uint64_t x71 = (x69 >> 8);
-  uint8_t x72 = (uint8_t)(x69 & UINT8_C(0xff));
-  uint64_t x73 = (x71 >> 8);
-  uint8_t x74 = (uint8_t)(x71 & UINT8_C(0xff));
-  uint64_t x75 = (x73 >> 8);
-  uint8_t x76 = (uint8_t)(x73 & UINT8_C(0xff));
-  uint8_t x77 = (uint8_t)(x75 >> 8);
-  uint8_t x78 = (uint8_t)(x75 & UINT8_C(0xff));
-  uint64_t x79 = (x77 + x22);
-  uint64_t x80 = (x79 >> 8);
-  uint8_t x81 = (uint8_t)(x79 & UINT8_C(0xff));
-  uint64_t x82 = (x80 >> 8);
-  uint8_t x83 = (uint8_t)(x80 & UINT8_C(0xff));
-  uint64_t x84 = (x82 >> 8);
-  uint8_t x85 = (uint8_t)(x82 & UINT8_C(0xff));
-  uint64_t x86 = (x84 >> 8);
-  uint8_t x87 = (uint8_t)(x84 & UINT8_C(0xff));
-  uint64_t x88 = (x86 >> 8);
-  uint8_t x89 = (uint8_t)(x86 & UINT8_C(0xff));
-  uint8_t x90 = (uint8_t)(x88 >> 8);
-  uint8_t x91 = (uint8_t)(x88 & UINT8_C(0xff));
-  out1[0] = x27;
-  out1[1] = x29;
-  out1[2] = x31;
-  out1[3] = x33;
-  out1[4] = x35;
-  out1[5] = x37;
-  out1[6] = x40;
-  out1[7] = x42;
-  out1[8] = x44;
-  out1[9] = x46;
-  out1[10] = x48;
-  out1[11] = x50;
-  out1[12] = x53;
-  out1[13] = x55;
-  out1[14] = x57;
-  out1[15] = x59;
-  out1[16] = x61;
-  out1[17] = x63;
-  out1[18] = x65;
-  out1[19] = x68;
-  out1[20] = x70;
-  out1[21] = x72;
-  out1[22] = x74;
-  out1[23] = x76;
-  out1[24] = x78;
-  out1[25] = x81;
-  out1[26] = x83;
-  out1[27] = x85;
-  out1[28] = x87;
-  out1[29] = x89;
-  out1[30] = x91;
-  out1[31] = x90;
+  x22 = (x20 << 4);
+  x23 = (x18 * (uint64_t)0x2);
+  x24 = (x16 << 6);
+  x25 = (x14 << 3);
+  x26 = (uint8_t)(x12 & UINT8_C(0xff));
+  x27 = (x12 >> 8);
+  x28 = (uint8_t)(x27 & UINT8_C(0xff));
+  x29 = (x27 >> 8);
+  x30 = (uint8_t)(x29 & UINT8_C(0xff));
+  x31 = (x29 >> 8);
+  x32 = (uint8_t)(x31 & UINT8_C(0xff));
+  x33 = (x31 >> 8);
+  x34 = (uint8_t)(x33 & UINT8_C(0xff));
+  x35 = (x33 >> 8);
+  x36 = (uint8_t)(x35 & UINT8_C(0xff));
+  x37 = (uint8_t)(x35 >> 8);
+  x38 = (x25 + (uint64_t)x37);
+  x39 = (uint8_t)(x38 & UINT8_C(0xff));
+  x40 = (x38 >> 8);
+  x41 = (uint8_t)(x40 & UINT8_C(0xff));
+  x42 = (x40 >> 8);
+  x43 = (uint8_t)(x42 & UINT8_C(0xff));
+  x44 = (x42 >> 8);
+  x45 = (uint8_t)(x44 & UINT8_C(0xff));
+  x46 = (x44 >> 8);
+  x47 = (uint8_t)(x46 & UINT8_C(0xff));
+  x48 = (x46 >> 8);
+  x49 = (uint8_t)(x48 & UINT8_C(0xff));
+  x50 = (uint8_t)(x48 >> 8);
+  x51 = (x24 + (uint64_t)x50);
+  x52 = (uint8_t)(x51 & UINT8_C(0xff));
+  x53 = (x51 >> 8);
+  x54 = (uint8_t)(x53 & UINT8_C(0xff));
+  x55 = (x53 >> 8);
+  x56 = (uint8_t)(x55 & UINT8_C(0xff));
+  x57 = (x55 >> 8);
+  x58 = (uint8_t)(x57 & UINT8_C(0xff));
+  x59 = (x57 >> 8);
+  x60 = (uint8_t)(x59 & UINT8_C(0xff));
+  x61 = (x59 >> 8);
+  x62 = (uint8_t)(x61 & UINT8_C(0xff));
+  x63 = (x61 >> 8);
+  x64 = (uint8_t)(x63 & UINT8_C(0xff));
+  x65 = (fiat_25519_uint1)(x63 >> 8);
+  x66 = (x23 + (uint64_t)x65);
+  x67 = (uint8_t)(x66 & UINT8_C(0xff));
+  x68 = (x66 >> 8);
+  x69 = (uint8_t)(x68 & UINT8_C(0xff));
+  x70 = (x68 >> 8);
+  x71 = (uint8_t)(x70 & UINT8_C(0xff));
+  x72 = (x70 >> 8);
+  x73 = (uint8_t)(x72 & UINT8_C(0xff));
+  x74 = (x72 >> 8);
+  x75 = (uint8_t)(x74 & UINT8_C(0xff));
+  x76 = (x74 >> 8);
+  x77 = (uint8_t)(x76 & UINT8_C(0xff));
+  x78 = (uint8_t)(x76 >> 8);
+  x79 = (x22 + (uint64_t)x78);
+  x80 = (uint8_t)(x79 & UINT8_C(0xff));
+  x81 = (x79 >> 8);
+  x82 = (uint8_t)(x81 & UINT8_C(0xff));
+  x83 = (x81 >> 8);
+  x84 = (uint8_t)(x83 & UINT8_C(0xff));
+  x85 = (x83 >> 8);
+  x86 = (uint8_t)(x85 & UINT8_C(0xff));
+  x87 = (x85 >> 8);
+  x88 = (uint8_t)(x87 & UINT8_C(0xff));
+  x89 = (x87 >> 8);
+  x90 = (uint8_t)(x89 & UINT8_C(0xff));
+  x91 = (uint8_t)(x89 >> 8);
+  out1[0] = x26;
+  out1[1] = x28;
+  out1[2] = x30;
+  out1[3] = x32;
+  out1[4] = x34;
+  out1[5] = x36;
+  out1[6] = x39;
+  out1[7] = x41;
+  out1[8] = x43;
+  out1[9] = x45;
+  out1[10] = x47;
+  out1[11] = x49;
+  out1[12] = x52;
+  out1[13] = x54;
+  out1[14] = x56;
+  out1[15] = x58;
+  out1[16] = x60;
+  out1[17] = x62;
+  out1[18] = x64;
+  out1[19] = x67;
+  out1[20] = x69;
+  out1[21] = x71;
+  out1[22] = x73;
+  out1[23] = x75;
+  out1[24] = x77;
+  out1[25] = x80;
+  out1[26] = x82;
+  out1[27] = x84;
+  out1[28] = x86;
+  out1[29] = x88;
+  out1[30] = x90;
+  out1[31] = x91;
 }
 
 /*
  * The function fiat_25519_from_bytes deserializes a field element from bytes in little-endian order.
+ *
  * Postconditions:
  *   eval out1 mod m = bytes_eval arg1 mod m
  *
  * Input Bounds:
  *   arg1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0x7f]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  */
-static void fiat_25519_from_bytes(uint64_t out1[5], const uint8_t arg1[32]) {
-  uint64_t x1 = ((uint64_t)(arg1[31]) << 44);
-  uint64_t x2 = ((uint64_t)(arg1[30]) << 36);
-  uint64_t x3 = ((uint64_t)(arg1[29]) << 28);
-  uint64_t x4 = ((uint64_t)(arg1[28]) << 20);
-  uint64_t x5 = ((uint64_t)(arg1[27]) << 12);
-  uint64_t x6 = ((uint64_t)(arg1[26]) << 4);
-  uint64_t x7 = ((uint64_t)(arg1[25]) << 47);
-  uint64_t x8 = ((uint64_t)(arg1[24]) << 39);
-  uint64_t x9 = ((uint64_t)(arg1[23]) << 31);
-  uint64_t x10 = ((uint64_t)(arg1[22]) << 23);
-  uint64_t x11 = ((uint64_t)(arg1[21]) << 15);
-  uint64_t x12 = ((uint64_t)(arg1[20]) << 7);
-  uint64_t x13 = ((uint64_t)(arg1[19]) << 50);
-  uint64_t x14 = ((uint64_t)(arg1[18]) << 42);
-  uint64_t x15 = ((uint64_t)(arg1[17]) << 34);
-  uint64_t x16 = ((uint64_t)(arg1[16]) << 26);
-  uint64_t x17 = ((uint64_t)(arg1[15]) << 18);
-  uint64_t x18 = ((uint64_t)(arg1[14]) << 10);
-  uint64_t x19 = ((uint64_t)(arg1[13]) << 2);
-  uint64_t x20 = ((uint64_t)(arg1[12]) << 45);
-  uint64_t x21 = ((uint64_t)(arg1[11]) << 37);
-  uint64_t x22 = ((uint64_t)(arg1[10]) << 29);
-  uint64_t x23 = ((uint64_t)(arg1[9]) << 21);
-  uint64_t x24 = ((uint64_t)(arg1[8]) << 13);
-  uint64_t x25 = ((uint64_t)(arg1[7]) << 5);
-  uint64_t x26 = ((uint64_t)(arg1[6]) << 48);
-  uint64_t x27 = ((uint64_t)(arg1[5]) << 40);
-  uint64_t x28 = ((uint64_t)(arg1[4]) << 32);
-  uint64_t x29 = ((uint64_t)(arg1[3]) << 24);
-  uint64_t x30 = ((uint64_t)(arg1[2]) << 16);
-  uint64_t x31 = ((uint64_t)(arg1[1]) << 8);
-  uint8_t x32 = (arg1[0]);
-  uint64_t x33 = (x32 + (x31 + (x30 + (x29 + (x28 + (x27 + x26))))));
-  uint8_t x34 = (uint8_t)(x33 >> 51);
-  uint64_t x35 = (x33 & UINT64_C(0x7ffffffffffff));
-  uint64_t x36 = (x6 + (x5 + (x4 + (x3 + (x2 + x1)))));
-  uint64_t x37 = (x12 + (x11 + (x10 + (x9 + (x8 + x7)))));
-  uint64_t x38 = (x19 + (x18 + (x17 + (x16 + (x15 + (x14 + x13))))));
-  uint64_t x39 = (x25 + (x24 + (x23 + (x22 + (x21 + x20)))));
-  uint64_t x40 = (x34 + x39);
-  uint8_t x41 = (uint8_t)(x40 >> 51);
-  uint64_t x42 = (x40 & UINT64_C(0x7ffffffffffff));
-  uint64_t x43 = (x41 + x38);
-  uint8_t x44 = (uint8_t)(x43 >> 51);
-  uint64_t x45 = (x43 & UINT64_C(0x7ffffffffffff));
-  uint64_t x46 = (x44 + x37);
-  uint8_t x47 = (uint8_t)(x46 >> 51);
-  uint64_t x48 = (x46 & UINT64_C(0x7ffffffffffff));
-  uint64_t x49 = (x47 + x36);
-  out1[0] = x35;
-  out1[1] = x42;
-  out1[2] = x45;
-  out1[3] = x48;
-  out1[4] = x49;
+static FIAT_25519_FIAT_INLINE void fiat_25519_from_bytes(fiat_25519_tight_field_element out1, const uint8_t arg1[32]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  uint64_t x14;
+  uint64_t x15;
+  uint64_t x16;
+  uint64_t x17;
+  uint64_t x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  uint64_t x26;
+  uint64_t x27;
+  uint64_t x28;
+  uint64_t x29;
+  uint64_t x30;
+  uint64_t x31;
+  uint8_t x32;
+  uint64_t x33;
+  uint64_t x34;
+  uint64_t x35;
+  uint64_t x36;
+  uint64_t x37;
+  uint64_t x38;
+  uint64_t x39;
+  uint8_t x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  uint8_t x48;
+  uint64_t x49;
+  uint64_t x50;
+  uint64_t x51;
+  uint64_t x52;
+  uint64_t x53;
+  uint64_t x54;
+  uint64_t x55;
+  uint64_t x56;
+  uint8_t x57;
+  uint64_t x58;
+  uint64_t x59;
+  uint64_t x60;
+  uint64_t x61;
+  uint64_t x62;
+  uint64_t x63;
+  uint64_t x64;
+  uint8_t x65;
+  uint64_t x66;
+  uint64_t x67;
+  uint64_t x68;
+  uint64_t x69;
+  uint64_t x70;
+  uint64_t x71;
+  x1 = ((uint64_t)(arg1[31]) << 44);
+  x2 = ((uint64_t)(arg1[30]) << 36);
+  x3 = ((uint64_t)(arg1[29]) << 28);
+  x4 = ((uint64_t)(arg1[28]) << 20);
+  x5 = ((uint64_t)(arg1[27]) << 12);
+  x6 = ((uint64_t)(arg1[26]) << 4);
+  x7 = ((uint64_t)(arg1[25]) << 47);
+  x8 = ((uint64_t)(arg1[24]) << 39);
+  x9 = ((uint64_t)(arg1[23]) << 31);
+  x10 = ((uint64_t)(arg1[22]) << 23);
+  x11 = ((uint64_t)(arg1[21]) << 15);
+  x12 = ((uint64_t)(arg1[20]) << 7);
+  x13 = ((uint64_t)(arg1[19]) << 50);
+  x14 = ((uint64_t)(arg1[18]) << 42);
+  x15 = ((uint64_t)(arg1[17]) << 34);
+  x16 = ((uint64_t)(arg1[16]) << 26);
+  x17 = ((uint64_t)(arg1[15]) << 18);
+  x18 = ((uint64_t)(arg1[14]) << 10);
+  x19 = ((uint64_t)(arg1[13]) << 2);
+  x20 = ((uint64_t)(arg1[12]) << 45);
+  x21 = ((uint64_t)(arg1[11]) << 37);
+  x22 = ((uint64_t)(arg1[10]) << 29);
+  x23 = ((uint64_t)(arg1[9]) << 21);
+  x24 = ((uint64_t)(arg1[8]) << 13);
+  x25 = ((uint64_t)(arg1[7]) << 5);
+  x26 = ((uint64_t)(arg1[6]) << 48);
+  x27 = ((uint64_t)(arg1[5]) << 40);
+  x28 = ((uint64_t)(arg1[4]) << 32);
+  x29 = ((uint64_t)(arg1[3]) << 24);
+  x30 = ((uint64_t)(arg1[2]) << 16);
+  x31 = ((uint64_t)(arg1[1]) << 8);
+  x32 = (arg1[0]);
+  x33 = (x31 + (uint64_t)x32);
+  x34 = (x30 + x33);
+  x35 = (x29 + x34);
+  x36 = (x28 + x35);
+  x37 = (x27 + x36);
+  x38 = (x26 + x37);
+  x39 = (x38 & UINT64_C(0x7ffffffffffff));
+  x40 = (uint8_t)(x38 >> 51);
+  x41 = (x25 + (uint64_t)x40);
+  x42 = (x24 + x41);
+  x43 = (x23 + x42);
+  x44 = (x22 + x43);
+  x45 = (x21 + x44);
+  x46 = (x20 + x45);
+  x47 = (x46 & UINT64_C(0x7ffffffffffff));
+  x48 = (uint8_t)(x46 >> 51);
+  x49 = (x19 + (uint64_t)x48);
+  x50 = (x18 + x49);
+  x51 = (x17 + x50);
+  x52 = (x16 + x51);
+  x53 = (x15 + x52);
+  x54 = (x14 + x53);
+  x55 = (x13 + x54);
+  x56 = (x55 & UINT64_C(0x7ffffffffffff));
+  x57 = (uint8_t)(x55 >> 51);
+  x58 = (x12 + (uint64_t)x57);
+  x59 = (x11 + x58);
+  x60 = (x10 + x59);
+  x61 = (x9 + x60);
+  x62 = (x8 + x61);
+  x63 = (x7 + x62);
+  x64 = (x63 & UINT64_C(0x7ffffffffffff));
+  x65 = (uint8_t)(x63 >> 51);
+  x66 = (x6 + (uint64_t)x65);
+  x67 = (x5 + x66);
+  x68 = (x4 + x67);
+  x69 = (x3 + x68);
+  x70 = (x2 + x69);
+  x71 = (x1 + x70);
+  out1[0] = x39;
+  out1[1] = x47;
+  out1[2] = x56;
+  out1[3] = x64;
+  out1[4] = x71;
+}
+
+/*
+ * The function fiat_25519_relax is the identity function converting from tight field elements to loose field elements.
+ *
+ * Postconditions:
+ *   out1 = arg1
+ * Each of the five limbs of arg1 is loaded into a local and then stored to out1 unchanged.
+ */
+static FIAT_25519_FIAT_INLINE void fiat_25519_relax(fiat_25519_loose_field_element out1, const fiat_25519_tight_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  x1 = (arg1[0]);  /* all five limbs are read before any store to out1 */
+  x2 = (arg1[1]);
+  x3 = (arg1[2]);
+  x4 = (arg1[3]);
+  x5 = (arg1[4]);
+  out1[0] = x1;
+  out1[1] = x2;
+  out1[2] = x3;
+  out1[3] = x4;
+  out1[4] = x5;
 }
 
 /*
  * The function fiat_25519_carry_scmul_121666 multiplies a field element by 121666 and reduces the result.
+ *
  * Postconditions:
  *   eval out1 mod m = (121666 * eval arg1) mod m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664], [0x0 ~> 0x1a666666666664]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc], [0x0 ~> 0x8cccccccccccc]]
  */
-static void fiat_25519_carry_scmul_121666(uint64_t out1[5], const uint64_t arg1[5]) {
-  fiat_25519_uint128 x1 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[4]));
-  fiat_25519_uint128 x2 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[3]));
-  fiat_25519_uint128 x3 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[2]));
-  fiat_25519_uint128 x4 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[1]));
-  fiat_25519_uint128 x5 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[0]));
-  uint64_t x6 = (uint64_t)(x5 >> 51);
-  uint64_t x7 = (uint64_t)(x5 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x8 = (x6 + x4);
-  uint64_t x9 = (uint64_t)(x8 >> 51);
-  uint64_t x10 = (uint64_t)(x8 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x11 = (x9 + x3);
-  uint64_t x12 = (uint64_t)(x11 >> 51);
-  uint64_t x13 = (uint64_t)(x11 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x14 = (x12 + x2);
-  uint64_t x15 = (uint64_t)(x14 >> 51);
-  uint64_t x16 = (uint64_t)(x14 & UINT64_C(0x7ffffffffffff));
-  fiat_25519_uint128 x17 = (x15 + x1);
-  uint64_t x18 = (uint64_t)(x17 >> 51);
-  uint64_t x19 = (uint64_t)(x17 & UINT64_C(0x7ffffffffffff));
-  uint64_t x20 = (x18 * UINT8_C(0x13));
-  uint64_t x21 = (x7 + x20);
-  fiat_25519_uint1 x22 = (fiat_25519_uint1)(x21 >> 51);
-  uint64_t x23 = (x21 & UINT64_C(0x7ffffffffffff));
-  uint64_t x24 = (x22 + x10);
-  fiat_25519_uint1 x25 = (fiat_25519_uint1)(x24 >> 51);
-  uint64_t x26 = (x24 & UINT64_C(0x7ffffffffffff));
-  uint64_t x27 = (x25 + x13);
+static FIAT_25519_FIAT_INLINE void fiat_25519_carry_scmul_121666(fiat_25519_tight_field_element out1, const fiat_25519_loose_field_element arg1) {
+  fiat_25519_uint128 x1;
+  fiat_25519_uint128 x2;
+  fiat_25519_uint128 x3;
+  fiat_25519_uint128 x4;
+  fiat_25519_uint128 x5;
+  uint64_t x6;
+  uint64_t x7;
+  fiat_25519_uint128 x8;
+  uint64_t x9;
+  uint64_t x10;
+  fiat_25519_uint128 x11;
+  uint64_t x12;
+  uint64_t x13;
+  fiat_25519_uint128 x14;
+  uint64_t x15;
+  uint64_t x16;
+  fiat_25519_uint128 x17;
+  uint64_t x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  fiat_25519_uint1 x22;
+  uint64_t x23;
+  uint64_t x24;
+  fiat_25519_uint1 x25;
+  uint64_t x26;
+  uint64_t x27;
+  x1 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[4]));  /* 0x1db42 == 121666; each limb product is formed in 128 bits */
+  x2 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[3]));
+  x3 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[2]));
+  x4 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[1]));
+  x5 = ((fiat_25519_uint128)UINT32_C(0x1db42) * (arg1[0]));
+  x6 = (uint64_t)(x5 >> 51);  /* carry out of limb 0: everything above the low 51 bits */
+  x7 = (uint64_t)(x5 & UINT64_C(0x7ffffffffffff));  /* low 51 bits of limb 0 */
+  x8 = (x6 + x4);  /* limb 1 plus carry from limb 0 */
+  x9 = (uint64_t)(x8 >> 51);
+  x10 = (uint64_t)(x8 & UINT64_C(0x7ffffffffffff));
+  x11 = (x9 + x3);  /* limb 2 plus carry from limb 1 */
+  x12 = (uint64_t)(x11 >> 51);
+  x13 = (uint64_t)(x11 & UINT64_C(0x7ffffffffffff));
+  x14 = (x12 + x2);  /* limb 3 plus carry from limb 2 */
+  x15 = (uint64_t)(x14 >> 51);
+  x16 = (uint64_t)(x14 & UINT64_C(0x7ffffffffffff));
+  x17 = (x15 + x1);  /* limb 4 plus carry from limb 3 */
+  x18 = (uint64_t)(x17 >> 51);
+  x19 = (uint64_t)(x17 & UINT64_C(0x7ffffffffffff));
+  x20 = (x18 * UINT8_C(0x13));  /* carry out of limb 4 is multiplied by 19 and folded into limb 0; presumably 2^255 = 19 mod m - confirm against the file header */
+  x21 = (x7 + x20);
+  x22 = (fiat_25519_uint1)(x21 >> 51);  /* second, short carry pass: limb 0 -> limb 1 */
+  x23 = (x21 & UINT64_C(0x7ffffffffffff));
+  x24 = (x22 + x10);
+  x25 = (fiat_25519_uint1)(x24 >> 51);  /* limb 1 -> limb 2 */
+  x26 = (x24 & UINT64_C(0x7ffffffffffff));
+  x27 = (x25 + x13);  /* limb 2 absorbs the last carry without a further mask */
   out1[0] = x23;
   out1[1] = x26;
   out1[2] = x27;
   out1[3] = x16;
   out1[4] = x19;
 }
-
diff --git a/third_party/fiat/p256_32.h b/third_party/fiat/p256_32.h
index 504da42..3812d8c 100644
--- a/third_party/fiat/p256_32.h
+++ b/third_party/fiat/p256_32.h
@@ -1,8 +1,8 @@
-/* Autogenerated: src/ExtractionOCaml/word_by_word_montgomery --static p256 '2^256 - 2^224 + 2^192 + 2^96 - 1' 32 mul square add sub opp from_montgomery nonzero selectznz to_bytes from_bytes */
+/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 32 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */
 /* curve description: p256 */
-/* requested operations: mul, square, add, sub, opp, from_montgomery, nonzero, selectznz, to_bytes, from_bytes */
-/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
 /* machine_wordsize = 32 (from "32") */
+/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */
+/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
 /*                                                                    */
 /* NOTE: In addition to the bounds specified above each function, all */
 /*   functions synthesized for this Montgomery arithmetic require the */
@@ -10,18 +10,47 @@
 /*   require the input to be in the unique saturated representation.  */
 /*   All functions also ensure that these two properties are true of  */
 /*   return values.                                                   */
+/*  */
+/* Computed values: */
+/*   eval z = z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) */
+/*   bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/*   twos_complement_eval z = let x1 := z[0] + (z[1] << 32) + (z[2] << 64) + (z[3] << 96) + (z[4] << 128) + (z[5] << 160) + (z[6] << 192) + (z[7] << 224) in */
+/*                            if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */
 
 #include <stdint.h>
 typedef unsigned char fiat_p256_uint1;
 typedef signed char fiat_p256_int1;
+#if defined(__GNUC__) || defined(__clang__)
+#  define FIAT_P256_FIAT_INLINE __inline__
+#else
+#  define FIAT_P256_FIAT_INLINE
+#endif
+
+/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */
+/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */
+typedef uint32_t fiat_p256_montgomery_domain_field_element[8];
+
+/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */
+/* Bounds: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]] */
+typedef uint32_t fiat_p256_non_montgomery_domain_field_element[8];
 
 #if (-1 & 3) != 3
 #error "This code only works on a two's complement system"
 #endif
 
+#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__))  /* asm barrier only on GCC/Clang and only when FIAT_P256_NO_ASM is not defined */
+static __inline__ uint32_t fiat_p256_value_barrier_u32(uint32_t a) {
+  __asm__("" : "+r"(a) : /* no inputs */);  /* empty asm template; a is a read-write register operand, so its value is opaque to the optimizer past this point */
+  return a;
+}
+#else
+#  define fiat_p256_value_barrier_u32(x) (x)  /* fallback: plain identity, no barrier */
+#endif
+
 
 /*
  * The function fiat_p256_addcarryx_u32 is an addition with carry.
+ *
  * Postconditions:
  *   out1 = (arg1 + arg2 + arg3) mod 2^32
  *   out2 = ⌊(arg1 + arg2 + arg3) / 2^32⌋
@@ -34,16 +63,20 @@
  *   out1: [0x0 ~> 0xffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_p256_addcarryx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  uint64_t x1 = ((arg1 + (uint64_t)arg2) + arg3);
-  uint32_t x2 = (uint32_t)(x1 & UINT32_C(0xffffffff));
-  fiat_p256_uint1 x3 = (fiat_p256_uint1)(x1 >> 32);
+static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  uint64_t x1;
+  uint32_t x2;
+  fiat_p256_uint1 x3;
+  x1 = ((arg1 + (uint64_t)arg2) + arg3);  /* sum is formed in 64 bits so the carry out of bit 31 is not lost */
+  x2 = (uint32_t)(x1 & UINT32_C(0xffffffff));  /* low 32 bits: the sum word */
+  x3 = (fiat_p256_uint1)(x1 >> 32);  /* bit 32: the carry-out */
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_p256_subborrowx_u32 is a subtraction with borrow.
+ *
  * Postconditions:
  *   out1 = (-arg1 + arg2 + -arg3) mod 2^32
  *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^32⌋
@@ -56,16 +89,20 @@
  *   out1: [0x0 ~> 0xffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_p256_subborrowx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  int64_t x1 = ((arg2 - (int64_t)arg1) - arg3);
-  fiat_p256_int1 x2 = (fiat_p256_int1)(x1 >> 32);
-  uint32_t x3 = (uint32_t)(x1 & UINT32_C(0xffffffff));
+static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u32(uint32_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  int64_t x1;
+  fiat_p256_int1 x2;
+  uint32_t x3;
+  x1 = ((arg2 - (int64_t)arg1) - arg3);  /* signed 64-bit difference; negative exactly when a borrow is needed */
+  x2 = (fiat_p256_int1)(x1 >> 32);  /* NOTE(review): >> on a negative int64_t is implementation-defined in C; the negation below yields 0/1 only if this shift is arithmetic (x2 is 0 or -1) */
+  x3 = (uint32_t)(x1 & UINT32_C(0xffffffff));  /* low 32 bits: the difference word */
   *out1 = x3;
   *out2 = (fiat_p256_uint1)(0x0 - x2);
 }
 
 /*
  * The function fiat_p256_mulx_u32 is a multiplication, returning the full double-width result.
+ *
  * Postconditions:
  *   out1 = (arg1 * arg2) mod 2^32
  *   out2 = ⌊arg1 * arg2 / 2^32⌋
@@ -77,16 +114,20 @@
  *   out1: [0x0 ~> 0xffffffff]
  *   out2: [0x0 ~> 0xffffffff]
  */
-static void fiat_p256_mulx_u32(uint32_t* out1, uint32_t* out2, uint32_t arg1, uint32_t arg2) {
-  uint64_t x1 = ((uint64_t)arg1 * arg2);
-  uint32_t x2 = (uint32_t)(x1 & UINT32_C(0xffffffff));
-  uint32_t x3 = (uint32_t)(x1 >> 32);
+static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u32(uint32_t* out1, uint32_t* out2, uint32_t arg1, uint32_t arg2) {
+  uint64_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  x1 = ((uint64_t)arg1 * arg2);  /* widen first so the full 64-bit product is kept */
+  x2 = (uint32_t)(x1 & UINT32_C(0xffffffff));  /* low half of the product */
+  x3 = (uint32_t)(x1 >> 32);  /* high half of the product */
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_p256_cmovznz_u32 is a single-word conditional move.
+ *
  * Postconditions:
  *   out1 = (if arg1 = 0 then arg2 else arg3)
  *
@@ -97,21 +138,19 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffff]
  */
-static void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
-  fiat_p256_uint1 x1 = (!(!arg1));
-  uint32_t x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff));
-  // Note this line has been patched from the synthesized code to add value
-  // barriers.
-  //
-  // Clang recognizes this pattern as a select. While it usually transforms it
-  // to a cmov, it sometimes further transforms it into a branch, which we do
-  // not want.
-  uint32_t x3 = ((value_barrier_u32(x2) & arg3) | (value_barrier_u32(~x2) & arg2));
+static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u32(uint32_t* out1, fiat_p256_uint1 arg1, uint32_t arg2, uint32_t arg3) {
+  fiat_p256_uint1 x1;
+  uint32_t x2;
+  uint32_t x3;
+  x1 = (!(!arg1));  /* normalize the condition to exactly 0 or 1 */
+  x2 = ((fiat_p256_int1)(0x0 - x1) & UINT32_C(0xffffffff));  /* mask: 0xffffffff when arg1 != 0, otherwise 0 */
+  x3 = ((fiat_p256_value_barrier_u32(x2) & arg3) | (fiat_p256_value_barrier_u32((~x2)) & arg2));  /* mask-based select of arg3 (mask set) or arg2 (mask clear); the mask and its complement each pass through the value barrier first */
   *out1 = x3;
 }
 
 /*
  * The function fiat_p256_mul multiplies two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -119,995 +158,1021 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- *   arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_mul(uint32_t out1[8], const uint32_t arg1[8], const uint32_t arg2[8]) {
-  uint32_t x1 = (arg1[1]);
-  uint32_t x2 = (arg1[2]);
-  uint32_t x3 = (arg1[3]);
-  uint32_t x4 = (arg1[4]);
-  uint32_t x5 = (arg1[5]);
-  uint32_t x6 = (arg1[6]);
-  uint32_t x7 = (arg1[7]);
-  uint32_t x8 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
   uint32_t x9;
   uint32_t x10;
-  fiat_p256_mulx_u32(&x9, &x10, x8, (arg2[7]));
   uint32_t x11;
   uint32_t x12;
-  fiat_p256_mulx_u32(&x11, &x12, x8, (arg2[6]));
   uint32_t x13;
   uint32_t x14;
-  fiat_p256_mulx_u32(&x13, &x14, x8, (arg2[5]));
   uint32_t x15;
   uint32_t x16;
-  fiat_p256_mulx_u32(&x15, &x16, x8, (arg2[4]));
   uint32_t x17;
   uint32_t x18;
-  fiat_p256_mulx_u32(&x17, &x18, x8, (arg2[3]));
   uint32_t x19;
   uint32_t x20;
-  fiat_p256_mulx_u32(&x19, &x20, x8, (arg2[2]));
   uint32_t x21;
   uint32_t x22;
-  fiat_p256_mulx_u32(&x21, &x22, x8, (arg2[1]));
   uint32_t x23;
   uint32_t x24;
-  fiat_p256_mulx_u32(&x23, &x24, x8, (arg2[0]));
   uint32_t x25;
   fiat_p256_uint1 x26;
-  fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21);
   uint32_t x27;
   fiat_p256_uint1 x28;
-  fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19);
   uint32_t x29;
   fiat_p256_uint1 x30;
-  fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17);
   uint32_t x31;
   fiat_p256_uint1 x32;
-  fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15);
   uint32_t x33;
   fiat_p256_uint1 x34;
-  fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13);
   uint32_t x35;
   fiat_p256_uint1 x36;
-  fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11);
   uint32_t x37;
   fiat_p256_uint1 x38;
-  fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9);
-  uint32_t x39 = (x38 + x10);
+  uint32_t x39;
   uint32_t x40;
   uint32_t x41;
-  fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff));
   uint32_t x42;
   uint32_t x43;
-  fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff));
   uint32_t x44;
   uint32_t x45;
-  fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff));
   uint32_t x46;
   uint32_t x47;
-  fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff));
   uint32_t x48;
   fiat_p256_uint1 x49;
-  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44);
   uint32_t x50;
   fiat_p256_uint1 x51;
-  fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42);
-  uint32_t x52 = (x51 + x43);
+  uint32_t x52;
   uint32_t x53;
   fiat_p256_uint1 x54;
-  fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46);
   uint32_t x55;
   fiat_p256_uint1 x56;
-  fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48);
   uint32_t x57;
   fiat_p256_uint1 x58;
-  fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50);
   uint32_t x59;
   fiat_p256_uint1 x60;
-  fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52);
   uint32_t x61;
   fiat_p256_uint1 x62;
-  fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0);
   uint32_t x63;
   fiat_p256_uint1 x64;
-  fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0);
   uint32_t x65;
   fiat_p256_uint1 x66;
-  fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23);
   uint32_t x67;
   fiat_p256_uint1 x68;
-  fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40);
   uint32_t x69;
   fiat_p256_uint1 x70;
-  fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41);
   uint32_t x71;
   uint32_t x72;
-  fiat_p256_mulx_u32(&x71, &x72, x1, (arg2[7]));
   uint32_t x73;
   uint32_t x74;
-  fiat_p256_mulx_u32(&x73, &x74, x1, (arg2[6]));
   uint32_t x75;
   uint32_t x76;
-  fiat_p256_mulx_u32(&x75, &x76, x1, (arg2[5]));
   uint32_t x77;
   uint32_t x78;
-  fiat_p256_mulx_u32(&x77, &x78, x1, (arg2[4]));
   uint32_t x79;
   uint32_t x80;
-  fiat_p256_mulx_u32(&x79, &x80, x1, (arg2[3]));
   uint32_t x81;
   uint32_t x82;
-  fiat_p256_mulx_u32(&x81, &x82, x1, (arg2[2]));
   uint32_t x83;
   uint32_t x84;
-  fiat_p256_mulx_u32(&x83, &x84, x1, (arg2[1]));
   uint32_t x85;
   uint32_t x86;
-  fiat_p256_mulx_u32(&x85, &x86, x1, (arg2[0]));
   uint32_t x87;
   fiat_p256_uint1 x88;
-  fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83);
   uint32_t x89;
   fiat_p256_uint1 x90;
-  fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81);
   uint32_t x91;
   fiat_p256_uint1 x92;
-  fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79);
   uint32_t x93;
   fiat_p256_uint1 x94;
-  fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77);
   uint32_t x95;
   fiat_p256_uint1 x96;
-  fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75);
   uint32_t x97;
   fiat_p256_uint1 x98;
-  fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73);
   uint32_t x99;
   fiat_p256_uint1 x100;
-  fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71);
-  uint32_t x101 = (x100 + x72);
+  uint32_t x101;
   uint32_t x102;
   fiat_p256_uint1 x103;
-  fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85);
   uint32_t x104;
   fiat_p256_uint1 x105;
-  fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87);
   uint32_t x106;
   fiat_p256_uint1 x107;
-  fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89);
   uint32_t x108;
   fiat_p256_uint1 x109;
-  fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91);
   uint32_t x110;
   fiat_p256_uint1 x111;
-  fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93);
   uint32_t x112;
   fiat_p256_uint1 x113;
-  fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95);
   uint32_t x114;
   fiat_p256_uint1 x115;
-  fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97);
   uint32_t x116;
   fiat_p256_uint1 x117;
-  fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99);
   uint32_t x118;
   fiat_p256_uint1 x119;
-  fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101);
   uint32_t x120;
   uint32_t x121;
-  fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff));
   uint32_t x122;
   uint32_t x123;
-  fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff));
   uint32_t x124;
   uint32_t x125;
-  fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff));
   uint32_t x126;
   uint32_t x127;
-  fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff));
   uint32_t x128;
   fiat_p256_uint1 x129;
-  fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124);
   uint32_t x130;
   fiat_p256_uint1 x131;
-  fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122);
-  uint32_t x132 = (x131 + x123);
+  uint32_t x132;
   uint32_t x133;
   fiat_p256_uint1 x134;
-  fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126);
   uint32_t x135;
   fiat_p256_uint1 x136;
-  fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128);
   uint32_t x137;
   fiat_p256_uint1 x138;
-  fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130);
   uint32_t x139;
   fiat_p256_uint1 x140;
-  fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132);
   uint32_t x141;
   fiat_p256_uint1 x142;
-  fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0);
   uint32_t x143;
   fiat_p256_uint1 x144;
-  fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0);
   uint32_t x145;
   fiat_p256_uint1 x146;
-  fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102);
   uint32_t x147;
   fiat_p256_uint1 x148;
-  fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120);
   uint32_t x149;
   fiat_p256_uint1 x150;
-  fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121);
-  uint32_t x151 = ((uint32_t)x150 + x119);
+  uint32_t x151;
   uint32_t x152;
   uint32_t x153;
-  fiat_p256_mulx_u32(&x152, &x153, x2, (arg2[7]));
   uint32_t x154;
   uint32_t x155;
-  fiat_p256_mulx_u32(&x154, &x155, x2, (arg2[6]));
   uint32_t x156;
   uint32_t x157;
-  fiat_p256_mulx_u32(&x156, &x157, x2, (arg2[5]));
   uint32_t x158;
   uint32_t x159;
-  fiat_p256_mulx_u32(&x158, &x159, x2, (arg2[4]));
   uint32_t x160;
   uint32_t x161;
-  fiat_p256_mulx_u32(&x160, &x161, x2, (arg2[3]));
   uint32_t x162;
   uint32_t x163;
-  fiat_p256_mulx_u32(&x162, &x163, x2, (arg2[2]));
   uint32_t x164;
   uint32_t x165;
-  fiat_p256_mulx_u32(&x164, &x165, x2, (arg2[1]));
   uint32_t x166;
   uint32_t x167;
-  fiat_p256_mulx_u32(&x166, &x167, x2, (arg2[0]));
   uint32_t x168;
   fiat_p256_uint1 x169;
-  fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164);
   uint32_t x170;
   fiat_p256_uint1 x171;
-  fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162);
   uint32_t x172;
   fiat_p256_uint1 x173;
-  fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160);
   uint32_t x174;
   fiat_p256_uint1 x175;
-  fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158);
   uint32_t x176;
   fiat_p256_uint1 x177;
-  fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156);
   uint32_t x178;
   fiat_p256_uint1 x179;
-  fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154);
   uint32_t x180;
   fiat_p256_uint1 x181;
-  fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152);
-  uint32_t x182 = (x181 + x153);
+  uint32_t x182;
   uint32_t x183;
   fiat_p256_uint1 x184;
-  fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166);
   uint32_t x185;
   fiat_p256_uint1 x186;
-  fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168);
   uint32_t x187;
   fiat_p256_uint1 x188;
-  fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170);
   uint32_t x189;
   fiat_p256_uint1 x190;
-  fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172);
   uint32_t x191;
   fiat_p256_uint1 x192;
-  fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174);
   uint32_t x193;
   fiat_p256_uint1 x194;
-  fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176);
   uint32_t x195;
   fiat_p256_uint1 x196;
-  fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178);
   uint32_t x197;
   fiat_p256_uint1 x198;
-  fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180);
   uint32_t x199;
   fiat_p256_uint1 x200;
-  fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182);
   uint32_t x201;
   uint32_t x202;
-  fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff));
   uint32_t x203;
   uint32_t x204;
-  fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff));
   uint32_t x205;
   uint32_t x206;
-  fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff));
   uint32_t x207;
   uint32_t x208;
-  fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff));
   uint32_t x209;
   fiat_p256_uint1 x210;
-  fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205);
   uint32_t x211;
   fiat_p256_uint1 x212;
-  fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203);
-  uint32_t x213 = (x212 + x204);
+  uint32_t x213;
   uint32_t x214;
   fiat_p256_uint1 x215;
-  fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207);
   uint32_t x216;
   fiat_p256_uint1 x217;
-  fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209);
   uint32_t x218;
   fiat_p256_uint1 x219;
-  fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211);
   uint32_t x220;
   fiat_p256_uint1 x221;
-  fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213);
   uint32_t x222;
   fiat_p256_uint1 x223;
-  fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0);
   uint32_t x224;
   fiat_p256_uint1 x225;
-  fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0);
   uint32_t x226;
   fiat_p256_uint1 x227;
-  fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183);
   uint32_t x228;
   fiat_p256_uint1 x229;
-  fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201);
   uint32_t x230;
   fiat_p256_uint1 x231;
-  fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202);
-  uint32_t x232 = ((uint32_t)x231 + x200);
+  uint32_t x232;
   uint32_t x233;
   uint32_t x234;
-  fiat_p256_mulx_u32(&x233, &x234, x3, (arg2[7]));
   uint32_t x235;
   uint32_t x236;
-  fiat_p256_mulx_u32(&x235, &x236, x3, (arg2[6]));
   uint32_t x237;
   uint32_t x238;
-  fiat_p256_mulx_u32(&x237, &x238, x3, (arg2[5]));
   uint32_t x239;
   uint32_t x240;
-  fiat_p256_mulx_u32(&x239, &x240, x3, (arg2[4]));
   uint32_t x241;
   uint32_t x242;
-  fiat_p256_mulx_u32(&x241, &x242, x3, (arg2[3]));
   uint32_t x243;
   uint32_t x244;
-  fiat_p256_mulx_u32(&x243, &x244, x3, (arg2[2]));
   uint32_t x245;
   uint32_t x246;
-  fiat_p256_mulx_u32(&x245, &x246, x3, (arg2[1]));
   uint32_t x247;
   uint32_t x248;
-  fiat_p256_mulx_u32(&x247, &x248, x3, (arg2[0]));
   uint32_t x249;
   fiat_p256_uint1 x250;
-  fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245);
   uint32_t x251;
   fiat_p256_uint1 x252;
-  fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243);
   uint32_t x253;
   fiat_p256_uint1 x254;
-  fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241);
   uint32_t x255;
   fiat_p256_uint1 x256;
-  fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239);
   uint32_t x257;
   fiat_p256_uint1 x258;
-  fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237);
   uint32_t x259;
   fiat_p256_uint1 x260;
-  fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235);
   uint32_t x261;
   fiat_p256_uint1 x262;
-  fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233);
-  uint32_t x263 = (x262 + x234);
+  uint32_t x263;
   uint32_t x264;
   fiat_p256_uint1 x265;
-  fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247);
   uint32_t x266;
   fiat_p256_uint1 x267;
-  fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249);
   uint32_t x268;
   fiat_p256_uint1 x269;
-  fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251);
   uint32_t x270;
   fiat_p256_uint1 x271;
-  fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253);
   uint32_t x272;
   fiat_p256_uint1 x273;
-  fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255);
   uint32_t x274;
   fiat_p256_uint1 x275;
-  fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257);
   uint32_t x276;
   fiat_p256_uint1 x277;
-  fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259);
   uint32_t x278;
   fiat_p256_uint1 x279;
-  fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261);
   uint32_t x280;
   fiat_p256_uint1 x281;
-  fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263);
   uint32_t x282;
   uint32_t x283;
-  fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff));
   uint32_t x284;
   uint32_t x285;
-  fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff));
   uint32_t x286;
   uint32_t x287;
-  fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff));
   uint32_t x288;
   uint32_t x289;
-  fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff));
   uint32_t x290;
   fiat_p256_uint1 x291;
-  fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286);
   uint32_t x292;
   fiat_p256_uint1 x293;
-  fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284);
-  uint32_t x294 = (x293 + x285);
+  uint32_t x294;
   uint32_t x295;
   fiat_p256_uint1 x296;
-  fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288);
   uint32_t x297;
   fiat_p256_uint1 x298;
-  fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290);
   uint32_t x299;
   fiat_p256_uint1 x300;
-  fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292);
   uint32_t x301;
   fiat_p256_uint1 x302;
-  fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294);
   uint32_t x303;
   fiat_p256_uint1 x304;
-  fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0);
   uint32_t x305;
   fiat_p256_uint1 x306;
-  fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0);
   uint32_t x307;
   fiat_p256_uint1 x308;
-  fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264);
   uint32_t x309;
   fiat_p256_uint1 x310;
-  fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282);
   uint32_t x311;
   fiat_p256_uint1 x312;
-  fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283);
-  uint32_t x313 = ((uint32_t)x312 + x281);
+  uint32_t x313;
   uint32_t x314;
   uint32_t x315;
-  fiat_p256_mulx_u32(&x314, &x315, x4, (arg2[7]));
   uint32_t x316;
   uint32_t x317;
-  fiat_p256_mulx_u32(&x316, &x317, x4, (arg2[6]));
   uint32_t x318;
   uint32_t x319;
-  fiat_p256_mulx_u32(&x318, &x319, x4, (arg2[5]));
   uint32_t x320;
   uint32_t x321;
-  fiat_p256_mulx_u32(&x320, &x321, x4, (arg2[4]));
   uint32_t x322;
   uint32_t x323;
-  fiat_p256_mulx_u32(&x322, &x323, x4, (arg2[3]));
   uint32_t x324;
   uint32_t x325;
-  fiat_p256_mulx_u32(&x324, &x325, x4, (arg2[2]));
   uint32_t x326;
   uint32_t x327;
-  fiat_p256_mulx_u32(&x326, &x327, x4, (arg2[1]));
   uint32_t x328;
   uint32_t x329;
-  fiat_p256_mulx_u32(&x328, &x329, x4, (arg2[0]));
   uint32_t x330;
   fiat_p256_uint1 x331;
-  fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326);
   uint32_t x332;
   fiat_p256_uint1 x333;
-  fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324);
   uint32_t x334;
   fiat_p256_uint1 x335;
-  fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322);
   uint32_t x336;
   fiat_p256_uint1 x337;
-  fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320);
   uint32_t x338;
   fiat_p256_uint1 x339;
-  fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318);
   uint32_t x340;
   fiat_p256_uint1 x341;
-  fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316);
   uint32_t x342;
   fiat_p256_uint1 x343;
-  fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314);
-  uint32_t x344 = (x343 + x315);
+  uint32_t x344;
   uint32_t x345;
   fiat_p256_uint1 x346;
-  fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328);
   uint32_t x347;
   fiat_p256_uint1 x348;
-  fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330);
   uint32_t x349;
   fiat_p256_uint1 x350;
-  fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332);
   uint32_t x351;
   fiat_p256_uint1 x352;
-  fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334);
   uint32_t x353;
   fiat_p256_uint1 x354;
-  fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336);
   uint32_t x355;
   fiat_p256_uint1 x356;
-  fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338);
   uint32_t x357;
   fiat_p256_uint1 x358;
-  fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340);
   uint32_t x359;
   fiat_p256_uint1 x360;
-  fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342);
   uint32_t x361;
   fiat_p256_uint1 x362;
-  fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344);
   uint32_t x363;
   uint32_t x364;
-  fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff));
   uint32_t x365;
   uint32_t x366;
-  fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff));
   uint32_t x367;
   uint32_t x368;
-  fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff));
   uint32_t x369;
   uint32_t x370;
-  fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff));
   uint32_t x371;
   fiat_p256_uint1 x372;
-  fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367);
   uint32_t x373;
   fiat_p256_uint1 x374;
-  fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365);
-  uint32_t x375 = (x374 + x366);
+  uint32_t x375;
   uint32_t x376;
   fiat_p256_uint1 x377;
-  fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369);
   uint32_t x378;
   fiat_p256_uint1 x379;
-  fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371);
   uint32_t x380;
   fiat_p256_uint1 x381;
-  fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373);
   uint32_t x382;
   fiat_p256_uint1 x383;
-  fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375);
   uint32_t x384;
   fiat_p256_uint1 x385;
-  fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0);
   uint32_t x386;
   fiat_p256_uint1 x387;
-  fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0);
   uint32_t x388;
   fiat_p256_uint1 x389;
-  fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345);
   uint32_t x390;
   fiat_p256_uint1 x391;
-  fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363);
   uint32_t x392;
   fiat_p256_uint1 x393;
-  fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364);
-  uint32_t x394 = ((uint32_t)x393 + x362);
+  uint32_t x394;
   uint32_t x395;
   uint32_t x396;
-  fiat_p256_mulx_u32(&x395, &x396, x5, (arg2[7]));
   uint32_t x397;
   uint32_t x398;
-  fiat_p256_mulx_u32(&x397, &x398, x5, (arg2[6]));
   uint32_t x399;
   uint32_t x400;
-  fiat_p256_mulx_u32(&x399, &x400, x5, (arg2[5]));
   uint32_t x401;
   uint32_t x402;
-  fiat_p256_mulx_u32(&x401, &x402, x5, (arg2[4]));
   uint32_t x403;
   uint32_t x404;
-  fiat_p256_mulx_u32(&x403, &x404, x5, (arg2[3]));
   uint32_t x405;
   uint32_t x406;
-  fiat_p256_mulx_u32(&x405, &x406, x5, (arg2[2]));
   uint32_t x407;
   uint32_t x408;
-  fiat_p256_mulx_u32(&x407, &x408, x5, (arg2[1]));
   uint32_t x409;
   uint32_t x410;
-  fiat_p256_mulx_u32(&x409, &x410, x5, (arg2[0]));
   uint32_t x411;
   fiat_p256_uint1 x412;
-  fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407);
   uint32_t x413;
   fiat_p256_uint1 x414;
-  fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405);
   uint32_t x415;
   fiat_p256_uint1 x416;
-  fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403);
   uint32_t x417;
   fiat_p256_uint1 x418;
-  fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401);
   uint32_t x419;
   fiat_p256_uint1 x420;
-  fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399);
   uint32_t x421;
   fiat_p256_uint1 x422;
-  fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397);
   uint32_t x423;
   fiat_p256_uint1 x424;
-  fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395);
-  uint32_t x425 = (x424 + x396);
+  uint32_t x425;
   uint32_t x426;
   fiat_p256_uint1 x427;
-  fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409);
   uint32_t x428;
   fiat_p256_uint1 x429;
-  fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411);
   uint32_t x430;
   fiat_p256_uint1 x431;
-  fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413);
   uint32_t x432;
   fiat_p256_uint1 x433;
-  fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415);
   uint32_t x434;
   fiat_p256_uint1 x435;
-  fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417);
   uint32_t x436;
   fiat_p256_uint1 x437;
-  fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419);
   uint32_t x438;
   fiat_p256_uint1 x439;
-  fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421);
   uint32_t x440;
   fiat_p256_uint1 x441;
-  fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423);
   uint32_t x442;
   fiat_p256_uint1 x443;
-  fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425);
   uint32_t x444;
   uint32_t x445;
-  fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff));
   uint32_t x446;
   uint32_t x447;
-  fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff));
   uint32_t x448;
   uint32_t x449;
-  fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff));
   uint32_t x450;
   uint32_t x451;
-  fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff));
   uint32_t x452;
   fiat_p256_uint1 x453;
-  fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448);
   uint32_t x454;
   fiat_p256_uint1 x455;
-  fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446);
-  uint32_t x456 = (x455 + x447);
+  uint32_t x456;
   uint32_t x457;
   fiat_p256_uint1 x458;
-  fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450);
   uint32_t x459;
   fiat_p256_uint1 x460;
-  fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452);
   uint32_t x461;
   fiat_p256_uint1 x462;
-  fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454);
   uint32_t x463;
   fiat_p256_uint1 x464;
-  fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456);
   uint32_t x465;
   fiat_p256_uint1 x466;
-  fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0);
   uint32_t x467;
   fiat_p256_uint1 x468;
-  fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0);
   uint32_t x469;
   fiat_p256_uint1 x470;
-  fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426);
   uint32_t x471;
   fiat_p256_uint1 x472;
-  fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444);
   uint32_t x473;
   fiat_p256_uint1 x474;
-  fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445);
-  uint32_t x475 = ((uint32_t)x474 + x443);
+  uint32_t x475;
   uint32_t x476;
   uint32_t x477;
-  fiat_p256_mulx_u32(&x476, &x477, x6, (arg2[7]));
   uint32_t x478;
   uint32_t x479;
-  fiat_p256_mulx_u32(&x478, &x479, x6, (arg2[6]));
   uint32_t x480;
   uint32_t x481;
-  fiat_p256_mulx_u32(&x480, &x481, x6, (arg2[5]));
   uint32_t x482;
   uint32_t x483;
-  fiat_p256_mulx_u32(&x482, &x483, x6, (arg2[4]));
   uint32_t x484;
   uint32_t x485;
-  fiat_p256_mulx_u32(&x484, &x485, x6, (arg2[3]));
   uint32_t x486;
   uint32_t x487;
-  fiat_p256_mulx_u32(&x486, &x487, x6, (arg2[2]));
   uint32_t x488;
   uint32_t x489;
-  fiat_p256_mulx_u32(&x488, &x489, x6, (arg2[1]));
   uint32_t x490;
   uint32_t x491;
-  fiat_p256_mulx_u32(&x490, &x491, x6, (arg2[0]));
   uint32_t x492;
   fiat_p256_uint1 x493;
-  fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488);
   uint32_t x494;
   fiat_p256_uint1 x495;
-  fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486);
   uint32_t x496;
   fiat_p256_uint1 x497;
-  fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484);
   uint32_t x498;
   fiat_p256_uint1 x499;
-  fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482);
   uint32_t x500;
   fiat_p256_uint1 x501;
-  fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480);
   uint32_t x502;
   fiat_p256_uint1 x503;
-  fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478);
   uint32_t x504;
   fiat_p256_uint1 x505;
-  fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476);
-  uint32_t x506 = (x505 + x477);
+  uint32_t x506;
   uint32_t x507;
   fiat_p256_uint1 x508;
-  fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490);
   uint32_t x509;
   fiat_p256_uint1 x510;
-  fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492);
   uint32_t x511;
   fiat_p256_uint1 x512;
-  fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494);
   uint32_t x513;
   fiat_p256_uint1 x514;
-  fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496);
   uint32_t x515;
   fiat_p256_uint1 x516;
-  fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498);
   uint32_t x517;
   fiat_p256_uint1 x518;
-  fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500);
   uint32_t x519;
   fiat_p256_uint1 x520;
-  fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502);
   uint32_t x521;
   fiat_p256_uint1 x522;
-  fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504);
   uint32_t x523;
   fiat_p256_uint1 x524;
-  fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506);
   uint32_t x525;
   uint32_t x526;
-  fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff));
   uint32_t x527;
   uint32_t x528;
-  fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff));
   uint32_t x529;
   uint32_t x530;
-  fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff));
   uint32_t x531;
   uint32_t x532;
-  fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff));
   uint32_t x533;
   fiat_p256_uint1 x534;
-  fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529);
   uint32_t x535;
   fiat_p256_uint1 x536;
-  fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527);
-  uint32_t x537 = (x536 + x528);
+  uint32_t x537;
   uint32_t x538;
   fiat_p256_uint1 x539;
-  fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531);
   uint32_t x540;
   fiat_p256_uint1 x541;
-  fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533);
   uint32_t x542;
   fiat_p256_uint1 x543;
-  fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535);
   uint32_t x544;
   fiat_p256_uint1 x545;
-  fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537);
   uint32_t x546;
   fiat_p256_uint1 x547;
-  fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0);
   uint32_t x548;
   fiat_p256_uint1 x549;
-  fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0);
   uint32_t x550;
   fiat_p256_uint1 x551;
-  fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507);
   uint32_t x552;
   fiat_p256_uint1 x553;
-  fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525);
   uint32_t x554;
   fiat_p256_uint1 x555;
-  fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526);
-  uint32_t x556 = ((uint32_t)x555 + x524);
+  uint32_t x556;
   uint32_t x557;
   uint32_t x558;
-  fiat_p256_mulx_u32(&x557, &x558, x7, (arg2[7]));
   uint32_t x559;
   uint32_t x560;
-  fiat_p256_mulx_u32(&x559, &x560, x7, (arg2[6]));
   uint32_t x561;
   uint32_t x562;
-  fiat_p256_mulx_u32(&x561, &x562, x7, (arg2[5]));
   uint32_t x563;
   uint32_t x564;
-  fiat_p256_mulx_u32(&x563, &x564, x7, (arg2[4]));
   uint32_t x565;
   uint32_t x566;
-  fiat_p256_mulx_u32(&x565, &x566, x7, (arg2[3]));
   uint32_t x567;
   uint32_t x568;
-  fiat_p256_mulx_u32(&x567, &x568, x7, (arg2[2]));
   uint32_t x569;
   uint32_t x570;
-  fiat_p256_mulx_u32(&x569, &x570, x7, (arg2[1]));
   uint32_t x571;
   uint32_t x572;
-  fiat_p256_mulx_u32(&x571, &x572, x7, (arg2[0]));
   uint32_t x573;
   fiat_p256_uint1 x574;
-  fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569);
   uint32_t x575;
   fiat_p256_uint1 x576;
-  fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567);
   uint32_t x577;
   fiat_p256_uint1 x578;
-  fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565);
   uint32_t x579;
   fiat_p256_uint1 x580;
-  fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563);
   uint32_t x581;
   fiat_p256_uint1 x582;
-  fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561);
   uint32_t x583;
   fiat_p256_uint1 x584;
-  fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559);
   uint32_t x585;
   fiat_p256_uint1 x586;
-  fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557);
-  uint32_t x587 = (x586 + x558);
+  uint32_t x587;
   uint32_t x588;
   fiat_p256_uint1 x589;
-  fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571);
   uint32_t x590;
   fiat_p256_uint1 x591;
-  fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573);
   uint32_t x592;
   fiat_p256_uint1 x593;
-  fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575);
   uint32_t x594;
   fiat_p256_uint1 x595;
-  fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577);
   uint32_t x596;
   fiat_p256_uint1 x597;
-  fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579);
   uint32_t x598;
   fiat_p256_uint1 x599;
-  fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581);
   uint32_t x600;
   fiat_p256_uint1 x601;
-  fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583);
   uint32_t x602;
   fiat_p256_uint1 x603;
-  fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585);
   uint32_t x604;
   fiat_p256_uint1 x605;
-  fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587);
   uint32_t x606;
   uint32_t x607;
-  fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff));
   uint32_t x608;
   uint32_t x609;
-  fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff));
   uint32_t x610;
   uint32_t x611;
-  fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff));
   uint32_t x612;
   uint32_t x613;
-  fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff));
   uint32_t x614;
   fiat_p256_uint1 x615;
-  fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610);
   uint32_t x616;
   fiat_p256_uint1 x617;
-  fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608);
-  uint32_t x618 = (x617 + x609);
+  uint32_t x618;
   uint32_t x619;
   fiat_p256_uint1 x620;
-  fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612);
   uint32_t x621;
   fiat_p256_uint1 x622;
-  fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614);
   uint32_t x623;
   fiat_p256_uint1 x624;
-  fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616);
   uint32_t x625;
   fiat_p256_uint1 x626;
-  fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618);
   uint32_t x627;
   fiat_p256_uint1 x628;
-  fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0);
   uint32_t x629;
   fiat_p256_uint1 x630;
-  fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0);
   uint32_t x631;
   fiat_p256_uint1 x632;
-  fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588);
   uint32_t x633;
   fiat_p256_uint1 x634;
-  fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606);
   uint32_t x635;
   fiat_p256_uint1 x636;
-  fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607);
-  uint32_t x637 = ((uint32_t)x636 + x605);
+  uint32_t x637;
   uint32_t x638;
   fiat_p256_uint1 x639;
-  fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff));
   uint32_t x640;
   fiat_p256_uint1 x641;
-  fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff));
   uint32_t x642;
   fiat_p256_uint1 x643;
-  fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff));
   uint32_t x644;
   fiat_p256_uint1 x645;
-  fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0);
   uint32_t x646;
   fiat_p256_uint1 x647;
-  fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0);
   uint32_t x648;
   fiat_p256_uint1 x649;
-  fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0);
   uint32_t x650;
   fiat_p256_uint1 x651;
-  fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1);
   uint32_t x652;
   fiat_p256_uint1 x653;
-  fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff));
   uint32_t x654;
   fiat_p256_uint1 x655;
-  fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0);
   uint32_t x656;
-  fiat_p256_cmovznz_u32(&x656, x655, x638, x621);
   uint32_t x657;
-  fiat_p256_cmovznz_u32(&x657, x655, x640, x623);
   uint32_t x658;
-  fiat_p256_cmovznz_u32(&x658, x655, x642, x625);
   uint32_t x659;
-  fiat_p256_cmovznz_u32(&x659, x655, x644, x627);
   uint32_t x660;
-  fiat_p256_cmovznz_u32(&x660, x655, x646, x629);
   uint32_t x661;
-  fiat_p256_cmovznz_u32(&x661, x655, x648, x631);
   uint32_t x662;
-  fiat_p256_cmovznz_u32(&x662, x655, x650, x633);
   uint32_t x663;
+  x1 = (arg1[1]);
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[4]);
+  x5 = (arg1[5]);
+  x6 = (arg1[6]);
+  x7 = (arg1[7]);
+  x8 = (arg1[0]);
+  fiat_p256_mulx_u32(&x9, &x10, x8, (arg2[7]));
+  fiat_p256_mulx_u32(&x11, &x12, x8, (arg2[6]));
+  fiat_p256_mulx_u32(&x13, &x14, x8, (arg2[5]));
+  fiat_p256_mulx_u32(&x15, &x16, x8, (arg2[4]));
+  fiat_p256_mulx_u32(&x17, &x18, x8, (arg2[3]));
+  fiat_p256_mulx_u32(&x19, &x20, x8, (arg2[2]));
+  fiat_p256_mulx_u32(&x21, &x22, x8, (arg2[1]));
+  fiat_p256_mulx_u32(&x23, &x24, x8, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21);
+  fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19);
+  fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17);
+  fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15);
+  fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13);
+  fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11);
+  fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9);
+  x39 = (x38 + x10);
+  fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44);
+  fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42);
+  x52 = (x51 + x43);
+  fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46);
+  fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48);
+  fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50);
+  fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52);
+  fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0);
+  fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0);
+  fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23);
+  fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40);
+  fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41);
+  fiat_p256_mulx_u32(&x71, &x72, x1, (arg2[7]));
+  fiat_p256_mulx_u32(&x73, &x74, x1, (arg2[6]));
+  fiat_p256_mulx_u32(&x75, &x76, x1, (arg2[5]));
+  fiat_p256_mulx_u32(&x77, &x78, x1, (arg2[4]));
+  fiat_p256_mulx_u32(&x79, &x80, x1, (arg2[3]));
+  fiat_p256_mulx_u32(&x81, &x82, x1, (arg2[2]));
+  fiat_p256_mulx_u32(&x83, &x84, x1, (arg2[1]));
+  fiat_p256_mulx_u32(&x85, &x86, x1, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83);
+  fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81);
+  fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79);
+  fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77);
+  fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75);
+  fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73);
+  fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71);
+  x101 = (x100 + x72);
+  fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85);
+  fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87);
+  fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89);
+  fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91);
+  fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93);
+  fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95);
+  fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97);
+  fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99);
+  fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101);
+  fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124);
+  fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122);
+  x132 = (x131 + x123);
+  fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126);
+  fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128);
+  fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130);
+  fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132);
+  fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0);
+  fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0);
+  fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102);
+  fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120);
+  fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121);
+  x151 = ((uint32_t)x150 + x119);
+  fiat_p256_mulx_u32(&x152, &x153, x2, (arg2[7]));
+  fiat_p256_mulx_u32(&x154, &x155, x2, (arg2[6]));
+  fiat_p256_mulx_u32(&x156, &x157, x2, (arg2[5]));
+  fiat_p256_mulx_u32(&x158, &x159, x2, (arg2[4]));
+  fiat_p256_mulx_u32(&x160, &x161, x2, (arg2[3]));
+  fiat_p256_mulx_u32(&x162, &x163, x2, (arg2[2]));
+  fiat_p256_mulx_u32(&x164, &x165, x2, (arg2[1]));
+  fiat_p256_mulx_u32(&x166, &x167, x2, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164);
+  fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162);
+  fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160);
+  fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158);
+  fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156);
+  fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154);
+  fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152);
+  x182 = (x181 + x153);
+  fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166);
+  fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168);
+  fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170);
+  fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172);
+  fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174);
+  fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176);
+  fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178);
+  fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180);
+  fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182);
+  fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205);
+  fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203);
+  x213 = (x212 + x204);
+  fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207);
+  fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209);
+  fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211);
+  fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213);
+  fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0);
+  fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0);
+  fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183);
+  fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201);
+  fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202);
+  x232 = ((uint32_t)x231 + x200);
+  fiat_p256_mulx_u32(&x233, &x234, x3, (arg2[7]));
+  fiat_p256_mulx_u32(&x235, &x236, x3, (arg2[6]));
+  fiat_p256_mulx_u32(&x237, &x238, x3, (arg2[5]));
+  fiat_p256_mulx_u32(&x239, &x240, x3, (arg2[4]));
+  fiat_p256_mulx_u32(&x241, &x242, x3, (arg2[3]));
+  fiat_p256_mulx_u32(&x243, &x244, x3, (arg2[2]));
+  fiat_p256_mulx_u32(&x245, &x246, x3, (arg2[1]));
+  fiat_p256_mulx_u32(&x247, &x248, x3, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245);
+  fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243);
+  fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241);
+  fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239);
+  fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237);
+  fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235);
+  fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233);
+  x263 = (x262 + x234);
+  fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247);
+  fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249);
+  fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251);
+  fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253);
+  fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255);
+  fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257);
+  fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259);
+  fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261);
+  fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263);
+  fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286);
+  fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284);
+  x294 = (x293 + x285);
+  fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288);
+  fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290);
+  fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292);
+  fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294);
+  fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0);
+  fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0);
+  fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264);
+  fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282);
+  fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283);
+  x313 = ((uint32_t)x312 + x281);
+  fiat_p256_mulx_u32(&x314, &x315, x4, (arg2[7]));
+  fiat_p256_mulx_u32(&x316, &x317, x4, (arg2[6]));
+  fiat_p256_mulx_u32(&x318, &x319, x4, (arg2[5]));
+  fiat_p256_mulx_u32(&x320, &x321, x4, (arg2[4]));
+  fiat_p256_mulx_u32(&x322, &x323, x4, (arg2[3]));
+  fiat_p256_mulx_u32(&x324, &x325, x4, (arg2[2]));
+  fiat_p256_mulx_u32(&x326, &x327, x4, (arg2[1]));
+  fiat_p256_mulx_u32(&x328, &x329, x4, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326);
+  fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324);
+  fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322);
+  fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320);
+  fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318);
+  fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316);
+  fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314);
+  x344 = (x343 + x315);
+  fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328);
+  fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330);
+  fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332);
+  fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334);
+  fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336);
+  fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338);
+  fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340);
+  fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342);
+  fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344);
+  fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367);
+  fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365);
+  x375 = (x374 + x366);
+  fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369);
+  fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371);
+  fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373);
+  fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375);
+  fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0);
+  fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0);
+  fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345);
+  fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363);
+  fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364);
+  x394 = ((uint32_t)x393 + x362);
+  fiat_p256_mulx_u32(&x395, &x396, x5, (arg2[7]));
+  fiat_p256_mulx_u32(&x397, &x398, x5, (arg2[6]));
+  fiat_p256_mulx_u32(&x399, &x400, x5, (arg2[5]));
+  fiat_p256_mulx_u32(&x401, &x402, x5, (arg2[4]));
+  fiat_p256_mulx_u32(&x403, &x404, x5, (arg2[3]));
+  fiat_p256_mulx_u32(&x405, &x406, x5, (arg2[2]));
+  fiat_p256_mulx_u32(&x407, &x408, x5, (arg2[1]));
+  fiat_p256_mulx_u32(&x409, &x410, x5, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407);
+  fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405);
+  fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403);
+  fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401);
+  fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399);
+  fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397);
+  fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395);
+  x425 = (x424 + x396);
+  fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409);
+  fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411);
+  fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413);
+  fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415);
+  fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417);
+  fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419);
+  fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421);
+  fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423);
+  fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425);
+  fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448);
+  fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446);
+  x456 = (x455 + x447);
+  fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450);
+  fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452);
+  fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454);
+  fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456);
+  fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0);
+  fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0);
+  fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426);
+  fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444);
+  fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445);
+  x475 = ((uint32_t)x474 + x443);
+  fiat_p256_mulx_u32(&x476, &x477, x6, (arg2[7]));
+  fiat_p256_mulx_u32(&x478, &x479, x6, (arg2[6]));
+  fiat_p256_mulx_u32(&x480, &x481, x6, (arg2[5]));
+  fiat_p256_mulx_u32(&x482, &x483, x6, (arg2[4]));
+  fiat_p256_mulx_u32(&x484, &x485, x6, (arg2[3]));
+  fiat_p256_mulx_u32(&x486, &x487, x6, (arg2[2]));
+  fiat_p256_mulx_u32(&x488, &x489, x6, (arg2[1]));
+  fiat_p256_mulx_u32(&x490, &x491, x6, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488);
+  fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486);
+  fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484);
+  fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482);
+  fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480);
+  fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478);
+  fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476);
+  x506 = (x505 + x477);
+  fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490);
+  fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492);
+  fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494);
+  fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496);
+  fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498);
+  fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500);
+  fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502);
+  fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504);
+  fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506);
+  fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529);
+  fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527);
+  x537 = (x536 + x528);
+  fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531);
+  fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533);
+  fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535);
+  fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537);
+  fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0);
+  fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0);
+  fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507);
+  fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525);
+  fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526);
+  x556 = ((uint32_t)x555 + x524);
+  fiat_p256_mulx_u32(&x557, &x558, x7, (arg2[7]));
+  fiat_p256_mulx_u32(&x559, &x560, x7, (arg2[6]));
+  fiat_p256_mulx_u32(&x561, &x562, x7, (arg2[5]));
+  fiat_p256_mulx_u32(&x563, &x564, x7, (arg2[4]));
+  fiat_p256_mulx_u32(&x565, &x566, x7, (arg2[3]));
+  fiat_p256_mulx_u32(&x567, &x568, x7, (arg2[2]));
+  fiat_p256_mulx_u32(&x569, &x570, x7, (arg2[1]));
+  fiat_p256_mulx_u32(&x571, &x572, x7, (arg2[0]));
+  fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569);
+  fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567);
+  fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565);
+  fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563);
+  fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561);
+  fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559);
+  fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557);
+  x587 = (x586 + x558);
+  fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571);
+  fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573);
+  fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575);
+  fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577);
+  fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579);
+  fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581);
+  fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583);
+  fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585);
+  fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587);
+  fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610);
+  fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608);
+  x618 = (x617 + x609);
+  fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612);
+  fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614);
+  fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616);
+  fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618);
+  fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0);
+  fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0);
+  fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588);
+  fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606);
+  fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607);
+  x637 = ((uint32_t)x636 + x605);
+  fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0);
+  fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0);
+  fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0);
+  fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1);
+  fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0);
+  fiat_p256_cmovznz_u32(&x656, x655, x638, x621);
+  fiat_p256_cmovznz_u32(&x657, x655, x640, x623);
+  fiat_p256_cmovznz_u32(&x658, x655, x642, x625);
+  fiat_p256_cmovznz_u32(&x659, x655, x644, x627);
+  fiat_p256_cmovznz_u32(&x660, x655, x646, x629);
+  fiat_p256_cmovznz_u32(&x661, x655, x648, x631);
+  fiat_p256_cmovznz_u32(&x662, x655, x650, x633);
   fiat_p256_cmovznz_u32(&x663, x655, x652, x635);
   out1[0] = x656;
   out1[1] = x657;
@@ -1121,1000 +1186,1028 @@
 
 /*
  * The function fiat_p256_square squares a field element in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_square(uint32_t out1[8], const uint32_t arg1[8]) {
-  uint32_t x1 = (arg1[1]);
-  uint32_t x2 = (arg1[2]);
-  uint32_t x3 = (arg1[3]);
-  uint32_t x4 = (arg1[4]);
-  uint32_t x5 = (arg1[5]);
-  uint32_t x6 = (arg1[6]);
-  uint32_t x7 = (arg1[7]);
-  uint32_t x8 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
   uint32_t x9;
   uint32_t x10;
-  fiat_p256_mulx_u32(&x9, &x10, x8, (arg1[7]));
   uint32_t x11;
   uint32_t x12;
-  fiat_p256_mulx_u32(&x11, &x12, x8, (arg1[6]));
   uint32_t x13;
   uint32_t x14;
-  fiat_p256_mulx_u32(&x13, &x14, x8, (arg1[5]));
   uint32_t x15;
   uint32_t x16;
-  fiat_p256_mulx_u32(&x15, &x16, x8, (arg1[4]));
   uint32_t x17;
   uint32_t x18;
-  fiat_p256_mulx_u32(&x17, &x18, x8, (arg1[3]));
   uint32_t x19;
   uint32_t x20;
-  fiat_p256_mulx_u32(&x19, &x20, x8, (arg1[2]));
   uint32_t x21;
   uint32_t x22;
-  fiat_p256_mulx_u32(&x21, &x22, x8, (arg1[1]));
   uint32_t x23;
   uint32_t x24;
-  fiat_p256_mulx_u32(&x23, &x24, x8, (arg1[0]));
   uint32_t x25;
   fiat_p256_uint1 x26;
-  fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21);
   uint32_t x27;
   fiat_p256_uint1 x28;
-  fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19);
   uint32_t x29;
   fiat_p256_uint1 x30;
-  fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17);
   uint32_t x31;
   fiat_p256_uint1 x32;
-  fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15);
   uint32_t x33;
   fiat_p256_uint1 x34;
-  fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13);
   uint32_t x35;
   fiat_p256_uint1 x36;
-  fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11);
   uint32_t x37;
   fiat_p256_uint1 x38;
-  fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9);
-  uint32_t x39 = (x38 + x10);
+  uint32_t x39;
   uint32_t x40;
   uint32_t x41;
-  fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff));
   uint32_t x42;
   uint32_t x43;
-  fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff));
   uint32_t x44;
   uint32_t x45;
-  fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff));
   uint32_t x46;
   uint32_t x47;
-  fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff));
   uint32_t x48;
   fiat_p256_uint1 x49;
-  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44);
   uint32_t x50;
   fiat_p256_uint1 x51;
-  fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42);
-  uint32_t x52 = (x51 + x43);
+  uint32_t x52;
   uint32_t x53;
   fiat_p256_uint1 x54;
-  fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46);
   uint32_t x55;
   fiat_p256_uint1 x56;
-  fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48);
   uint32_t x57;
   fiat_p256_uint1 x58;
-  fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50);
   uint32_t x59;
   fiat_p256_uint1 x60;
-  fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52);
   uint32_t x61;
   fiat_p256_uint1 x62;
-  fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0);
   uint32_t x63;
   fiat_p256_uint1 x64;
-  fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0);
   uint32_t x65;
   fiat_p256_uint1 x66;
-  fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23);
   uint32_t x67;
   fiat_p256_uint1 x68;
-  fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40);
   uint32_t x69;
   fiat_p256_uint1 x70;
-  fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41);
   uint32_t x71;
   uint32_t x72;
-  fiat_p256_mulx_u32(&x71, &x72, x1, (arg1[7]));
   uint32_t x73;
   uint32_t x74;
-  fiat_p256_mulx_u32(&x73, &x74, x1, (arg1[6]));
   uint32_t x75;
   uint32_t x76;
-  fiat_p256_mulx_u32(&x75, &x76, x1, (arg1[5]));
   uint32_t x77;
   uint32_t x78;
-  fiat_p256_mulx_u32(&x77, &x78, x1, (arg1[4]));
   uint32_t x79;
   uint32_t x80;
-  fiat_p256_mulx_u32(&x79, &x80, x1, (arg1[3]));
   uint32_t x81;
   uint32_t x82;
-  fiat_p256_mulx_u32(&x81, &x82, x1, (arg1[2]));
   uint32_t x83;
   uint32_t x84;
-  fiat_p256_mulx_u32(&x83, &x84, x1, (arg1[1]));
   uint32_t x85;
   uint32_t x86;
-  fiat_p256_mulx_u32(&x85, &x86, x1, (arg1[0]));
   uint32_t x87;
   fiat_p256_uint1 x88;
-  fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83);
   uint32_t x89;
   fiat_p256_uint1 x90;
-  fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81);
   uint32_t x91;
   fiat_p256_uint1 x92;
-  fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79);
   uint32_t x93;
   fiat_p256_uint1 x94;
-  fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77);
   uint32_t x95;
   fiat_p256_uint1 x96;
-  fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75);
   uint32_t x97;
   fiat_p256_uint1 x98;
-  fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73);
   uint32_t x99;
   fiat_p256_uint1 x100;
-  fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71);
-  uint32_t x101 = (x100 + x72);
+  uint32_t x101;
   uint32_t x102;
   fiat_p256_uint1 x103;
-  fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85);
   uint32_t x104;
   fiat_p256_uint1 x105;
-  fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87);
   uint32_t x106;
   fiat_p256_uint1 x107;
-  fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89);
   uint32_t x108;
   fiat_p256_uint1 x109;
-  fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91);
   uint32_t x110;
   fiat_p256_uint1 x111;
-  fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93);
   uint32_t x112;
   fiat_p256_uint1 x113;
-  fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95);
   uint32_t x114;
   fiat_p256_uint1 x115;
-  fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97);
   uint32_t x116;
   fiat_p256_uint1 x117;
-  fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99);
   uint32_t x118;
   fiat_p256_uint1 x119;
-  fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101);
   uint32_t x120;
   uint32_t x121;
-  fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff));
   uint32_t x122;
   uint32_t x123;
-  fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff));
   uint32_t x124;
   uint32_t x125;
-  fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff));
   uint32_t x126;
   uint32_t x127;
-  fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff));
   uint32_t x128;
   fiat_p256_uint1 x129;
-  fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124);
   uint32_t x130;
   fiat_p256_uint1 x131;
-  fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122);
-  uint32_t x132 = (x131 + x123);
+  uint32_t x132;
   uint32_t x133;
   fiat_p256_uint1 x134;
-  fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126);
   uint32_t x135;
   fiat_p256_uint1 x136;
-  fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128);
   uint32_t x137;
   fiat_p256_uint1 x138;
-  fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130);
   uint32_t x139;
   fiat_p256_uint1 x140;
-  fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132);
   uint32_t x141;
   fiat_p256_uint1 x142;
-  fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0);
   uint32_t x143;
   fiat_p256_uint1 x144;
-  fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0);
   uint32_t x145;
   fiat_p256_uint1 x146;
-  fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102);
   uint32_t x147;
   fiat_p256_uint1 x148;
-  fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120);
   uint32_t x149;
   fiat_p256_uint1 x150;
-  fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121);
-  uint32_t x151 = ((uint32_t)x150 + x119);
+  uint32_t x151;
   uint32_t x152;
   uint32_t x153;
-  fiat_p256_mulx_u32(&x152, &x153, x2, (arg1[7]));
   uint32_t x154;
   uint32_t x155;
-  fiat_p256_mulx_u32(&x154, &x155, x2, (arg1[6]));
   uint32_t x156;
   uint32_t x157;
-  fiat_p256_mulx_u32(&x156, &x157, x2, (arg1[5]));
   uint32_t x158;
   uint32_t x159;
-  fiat_p256_mulx_u32(&x158, &x159, x2, (arg1[4]));
   uint32_t x160;
   uint32_t x161;
-  fiat_p256_mulx_u32(&x160, &x161, x2, (arg1[3]));
   uint32_t x162;
   uint32_t x163;
-  fiat_p256_mulx_u32(&x162, &x163, x2, (arg1[2]));
   uint32_t x164;
   uint32_t x165;
-  fiat_p256_mulx_u32(&x164, &x165, x2, (arg1[1]));
   uint32_t x166;
   uint32_t x167;
-  fiat_p256_mulx_u32(&x166, &x167, x2, (arg1[0]));
   uint32_t x168;
   fiat_p256_uint1 x169;
-  fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164);
   uint32_t x170;
   fiat_p256_uint1 x171;
-  fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162);
   uint32_t x172;
   fiat_p256_uint1 x173;
-  fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160);
   uint32_t x174;
   fiat_p256_uint1 x175;
-  fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158);
   uint32_t x176;
   fiat_p256_uint1 x177;
-  fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156);
   uint32_t x178;
   fiat_p256_uint1 x179;
-  fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154);
   uint32_t x180;
   fiat_p256_uint1 x181;
-  fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152);
-  uint32_t x182 = (x181 + x153);
+  uint32_t x182;
   uint32_t x183;
   fiat_p256_uint1 x184;
-  fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166);
   uint32_t x185;
   fiat_p256_uint1 x186;
-  fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168);
   uint32_t x187;
   fiat_p256_uint1 x188;
-  fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170);
   uint32_t x189;
   fiat_p256_uint1 x190;
-  fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172);
   uint32_t x191;
   fiat_p256_uint1 x192;
-  fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174);
   uint32_t x193;
   fiat_p256_uint1 x194;
-  fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176);
   uint32_t x195;
   fiat_p256_uint1 x196;
-  fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178);
   uint32_t x197;
   fiat_p256_uint1 x198;
-  fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180);
   uint32_t x199;
   fiat_p256_uint1 x200;
-  fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182);
   uint32_t x201;
   uint32_t x202;
-  fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff));
   uint32_t x203;
   uint32_t x204;
-  fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff));
   uint32_t x205;
   uint32_t x206;
-  fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff));
   uint32_t x207;
   uint32_t x208;
-  fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff));
   uint32_t x209;
   fiat_p256_uint1 x210;
-  fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205);
   uint32_t x211;
   fiat_p256_uint1 x212;
-  fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203);
-  uint32_t x213 = (x212 + x204);
+  uint32_t x213;
   uint32_t x214;
   fiat_p256_uint1 x215;
-  fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207);
   uint32_t x216;
   fiat_p256_uint1 x217;
-  fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209);
   uint32_t x218;
   fiat_p256_uint1 x219;
-  fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211);
   uint32_t x220;
   fiat_p256_uint1 x221;
-  fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213);
   uint32_t x222;
   fiat_p256_uint1 x223;
-  fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0);
   uint32_t x224;
   fiat_p256_uint1 x225;
-  fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0);
   uint32_t x226;
   fiat_p256_uint1 x227;
-  fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183);
   uint32_t x228;
   fiat_p256_uint1 x229;
-  fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201);
   uint32_t x230;
   fiat_p256_uint1 x231;
-  fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202);
-  uint32_t x232 = ((uint32_t)x231 + x200);
+  uint32_t x232;
   uint32_t x233;
   uint32_t x234;
-  fiat_p256_mulx_u32(&x233, &x234, x3, (arg1[7]));
   uint32_t x235;
   uint32_t x236;
-  fiat_p256_mulx_u32(&x235, &x236, x3, (arg1[6]));
   uint32_t x237;
   uint32_t x238;
-  fiat_p256_mulx_u32(&x237, &x238, x3, (arg1[5]));
   uint32_t x239;
   uint32_t x240;
-  fiat_p256_mulx_u32(&x239, &x240, x3, (arg1[4]));
   uint32_t x241;
   uint32_t x242;
-  fiat_p256_mulx_u32(&x241, &x242, x3, (arg1[3]));
   uint32_t x243;
   uint32_t x244;
-  fiat_p256_mulx_u32(&x243, &x244, x3, (arg1[2]));
   uint32_t x245;
   uint32_t x246;
-  fiat_p256_mulx_u32(&x245, &x246, x3, (arg1[1]));
   uint32_t x247;
   uint32_t x248;
-  fiat_p256_mulx_u32(&x247, &x248, x3, (arg1[0]));
   uint32_t x249;
   fiat_p256_uint1 x250;
-  fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245);
   uint32_t x251;
   fiat_p256_uint1 x252;
-  fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243);
   uint32_t x253;
   fiat_p256_uint1 x254;
-  fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241);
   uint32_t x255;
   fiat_p256_uint1 x256;
-  fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239);
   uint32_t x257;
   fiat_p256_uint1 x258;
-  fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237);
   uint32_t x259;
   fiat_p256_uint1 x260;
-  fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235);
   uint32_t x261;
   fiat_p256_uint1 x262;
-  fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233);
-  uint32_t x263 = (x262 + x234);
+  uint32_t x263;
   uint32_t x264;
   fiat_p256_uint1 x265;
-  fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247);
   uint32_t x266;
   fiat_p256_uint1 x267;
-  fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249);
   uint32_t x268;
   fiat_p256_uint1 x269;
-  fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251);
   uint32_t x270;
   fiat_p256_uint1 x271;
-  fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253);
   uint32_t x272;
   fiat_p256_uint1 x273;
-  fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255);
   uint32_t x274;
   fiat_p256_uint1 x275;
-  fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257);
   uint32_t x276;
   fiat_p256_uint1 x277;
-  fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259);
   uint32_t x278;
   fiat_p256_uint1 x279;
-  fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261);
   uint32_t x280;
   fiat_p256_uint1 x281;
-  fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263);
   uint32_t x282;
   uint32_t x283;
-  fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff));
   uint32_t x284;
   uint32_t x285;
-  fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff));
   uint32_t x286;
   uint32_t x287;
-  fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff));
   uint32_t x288;
   uint32_t x289;
-  fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff));
   uint32_t x290;
   fiat_p256_uint1 x291;
-  fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286);
   uint32_t x292;
   fiat_p256_uint1 x293;
-  fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284);
-  uint32_t x294 = (x293 + x285);
+  uint32_t x294;
   uint32_t x295;
   fiat_p256_uint1 x296;
-  fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288);
   uint32_t x297;
   fiat_p256_uint1 x298;
-  fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290);
   uint32_t x299;
   fiat_p256_uint1 x300;
-  fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292);
   uint32_t x301;
   fiat_p256_uint1 x302;
-  fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294);
   uint32_t x303;
   fiat_p256_uint1 x304;
-  fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0);
   uint32_t x305;
   fiat_p256_uint1 x306;
-  fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0);
   uint32_t x307;
   fiat_p256_uint1 x308;
-  fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264);
   uint32_t x309;
   fiat_p256_uint1 x310;
-  fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282);
   uint32_t x311;
   fiat_p256_uint1 x312;
-  fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283);
-  uint32_t x313 = ((uint32_t)x312 + x281);
+  uint32_t x313;
   uint32_t x314;
   uint32_t x315;
-  fiat_p256_mulx_u32(&x314, &x315, x4, (arg1[7]));
   uint32_t x316;
   uint32_t x317;
-  fiat_p256_mulx_u32(&x316, &x317, x4, (arg1[6]));
   uint32_t x318;
   uint32_t x319;
-  fiat_p256_mulx_u32(&x318, &x319, x4, (arg1[5]));
   uint32_t x320;
   uint32_t x321;
-  fiat_p256_mulx_u32(&x320, &x321, x4, (arg1[4]));
   uint32_t x322;
   uint32_t x323;
-  fiat_p256_mulx_u32(&x322, &x323, x4, (arg1[3]));
   uint32_t x324;
   uint32_t x325;
-  fiat_p256_mulx_u32(&x324, &x325, x4, (arg1[2]));
   uint32_t x326;
   uint32_t x327;
-  fiat_p256_mulx_u32(&x326, &x327, x4, (arg1[1]));
   uint32_t x328;
   uint32_t x329;
-  fiat_p256_mulx_u32(&x328, &x329, x4, (arg1[0]));
   uint32_t x330;
   fiat_p256_uint1 x331;
-  fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326);
   uint32_t x332;
   fiat_p256_uint1 x333;
-  fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324);
   uint32_t x334;
   fiat_p256_uint1 x335;
-  fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322);
   uint32_t x336;
   fiat_p256_uint1 x337;
-  fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320);
   uint32_t x338;
   fiat_p256_uint1 x339;
-  fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318);
   uint32_t x340;
   fiat_p256_uint1 x341;
-  fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316);
   uint32_t x342;
   fiat_p256_uint1 x343;
-  fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314);
-  uint32_t x344 = (x343 + x315);
+  uint32_t x344;
   uint32_t x345;
   fiat_p256_uint1 x346;
-  fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328);
   uint32_t x347;
   fiat_p256_uint1 x348;
-  fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330);
   uint32_t x349;
   fiat_p256_uint1 x350;
-  fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332);
   uint32_t x351;
   fiat_p256_uint1 x352;
-  fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334);
   uint32_t x353;
   fiat_p256_uint1 x354;
-  fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336);
   uint32_t x355;
   fiat_p256_uint1 x356;
-  fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338);
   uint32_t x357;
   fiat_p256_uint1 x358;
-  fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340);
   uint32_t x359;
   fiat_p256_uint1 x360;
-  fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342);
   uint32_t x361;
   fiat_p256_uint1 x362;
-  fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344);
   uint32_t x363;
   uint32_t x364;
-  fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff));
   uint32_t x365;
   uint32_t x366;
-  fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff));
   uint32_t x367;
   uint32_t x368;
-  fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff));
   uint32_t x369;
   uint32_t x370;
-  fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff));
   uint32_t x371;
   fiat_p256_uint1 x372;
-  fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367);
   uint32_t x373;
   fiat_p256_uint1 x374;
-  fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365);
-  uint32_t x375 = (x374 + x366);
+  uint32_t x375;
   uint32_t x376;
   fiat_p256_uint1 x377;
-  fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369);
   uint32_t x378;
   fiat_p256_uint1 x379;
-  fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371);
   uint32_t x380;
   fiat_p256_uint1 x381;
-  fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373);
   uint32_t x382;
   fiat_p256_uint1 x383;
-  fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375);
   uint32_t x384;
   fiat_p256_uint1 x385;
-  fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0);
   uint32_t x386;
   fiat_p256_uint1 x387;
-  fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0);
   uint32_t x388;
   fiat_p256_uint1 x389;
-  fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345);
   uint32_t x390;
   fiat_p256_uint1 x391;
-  fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363);
   uint32_t x392;
   fiat_p256_uint1 x393;
-  fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364);
-  uint32_t x394 = ((uint32_t)x393 + x362);
+  uint32_t x394;
   uint32_t x395;
   uint32_t x396;
-  fiat_p256_mulx_u32(&x395, &x396, x5, (arg1[7]));
   uint32_t x397;
   uint32_t x398;
-  fiat_p256_mulx_u32(&x397, &x398, x5, (arg1[6]));
   uint32_t x399;
   uint32_t x400;
-  fiat_p256_mulx_u32(&x399, &x400, x5, (arg1[5]));
   uint32_t x401;
   uint32_t x402;
-  fiat_p256_mulx_u32(&x401, &x402, x5, (arg1[4]));
   uint32_t x403;
   uint32_t x404;
-  fiat_p256_mulx_u32(&x403, &x404, x5, (arg1[3]));
   uint32_t x405;
   uint32_t x406;
-  fiat_p256_mulx_u32(&x405, &x406, x5, (arg1[2]));
   uint32_t x407;
   uint32_t x408;
-  fiat_p256_mulx_u32(&x407, &x408, x5, (arg1[1]));
   uint32_t x409;
   uint32_t x410;
-  fiat_p256_mulx_u32(&x409, &x410, x5, (arg1[0]));
   uint32_t x411;
   fiat_p256_uint1 x412;
-  fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407);
   uint32_t x413;
   fiat_p256_uint1 x414;
-  fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405);
   uint32_t x415;
   fiat_p256_uint1 x416;
-  fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403);
   uint32_t x417;
   fiat_p256_uint1 x418;
-  fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401);
   uint32_t x419;
   fiat_p256_uint1 x420;
-  fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399);
   uint32_t x421;
   fiat_p256_uint1 x422;
-  fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397);
   uint32_t x423;
   fiat_p256_uint1 x424;
-  fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395);
-  uint32_t x425 = (x424 + x396);
+  uint32_t x425;
   uint32_t x426;
   fiat_p256_uint1 x427;
-  fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409);
   uint32_t x428;
   fiat_p256_uint1 x429;
-  fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411);
   uint32_t x430;
   fiat_p256_uint1 x431;
-  fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413);
   uint32_t x432;
   fiat_p256_uint1 x433;
-  fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415);
   uint32_t x434;
   fiat_p256_uint1 x435;
-  fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417);
   uint32_t x436;
   fiat_p256_uint1 x437;
-  fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419);
   uint32_t x438;
   fiat_p256_uint1 x439;
-  fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421);
   uint32_t x440;
   fiat_p256_uint1 x441;
-  fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423);
   uint32_t x442;
   fiat_p256_uint1 x443;
-  fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425);
   uint32_t x444;
   uint32_t x445;
-  fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff));
   uint32_t x446;
   uint32_t x447;
-  fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff));
   uint32_t x448;
   uint32_t x449;
-  fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff));
   uint32_t x450;
   uint32_t x451;
-  fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff));
   uint32_t x452;
   fiat_p256_uint1 x453;
-  fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448);
   uint32_t x454;
   fiat_p256_uint1 x455;
-  fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446);
-  uint32_t x456 = (x455 + x447);
+  uint32_t x456;
   uint32_t x457;
   fiat_p256_uint1 x458;
-  fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450);
   uint32_t x459;
   fiat_p256_uint1 x460;
-  fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452);
   uint32_t x461;
   fiat_p256_uint1 x462;
-  fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454);
   uint32_t x463;
   fiat_p256_uint1 x464;
-  fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456);
   uint32_t x465;
   fiat_p256_uint1 x466;
-  fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0);
   uint32_t x467;
   fiat_p256_uint1 x468;
-  fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0);
   uint32_t x469;
   fiat_p256_uint1 x470;
-  fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426);
   uint32_t x471;
   fiat_p256_uint1 x472;
-  fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444);
   uint32_t x473;
   fiat_p256_uint1 x474;
-  fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445);
-  uint32_t x475 = ((uint32_t)x474 + x443);
+  uint32_t x475;
   uint32_t x476;
   uint32_t x477;
-  fiat_p256_mulx_u32(&x476, &x477, x6, (arg1[7]));
   uint32_t x478;
   uint32_t x479;
-  fiat_p256_mulx_u32(&x478, &x479, x6, (arg1[6]));
   uint32_t x480;
   uint32_t x481;
-  fiat_p256_mulx_u32(&x480, &x481, x6, (arg1[5]));
   uint32_t x482;
   uint32_t x483;
-  fiat_p256_mulx_u32(&x482, &x483, x6, (arg1[4]));
   uint32_t x484;
   uint32_t x485;
-  fiat_p256_mulx_u32(&x484, &x485, x6, (arg1[3]));
   uint32_t x486;
   uint32_t x487;
-  fiat_p256_mulx_u32(&x486, &x487, x6, (arg1[2]));
   uint32_t x488;
   uint32_t x489;
-  fiat_p256_mulx_u32(&x488, &x489, x6, (arg1[1]));
   uint32_t x490;
   uint32_t x491;
-  fiat_p256_mulx_u32(&x490, &x491, x6, (arg1[0]));
   uint32_t x492;
   fiat_p256_uint1 x493;
-  fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488);
   uint32_t x494;
   fiat_p256_uint1 x495;
-  fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486);
   uint32_t x496;
   fiat_p256_uint1 x497;
-  fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484);
   uint32_t x498;
   fiat_p256_uint1 x499;
-  fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482);
   uint32_t x500;
   fiat_p256_uint1 x501;
-  fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480);
   uint32_t x502;
   fiat_p256_uint1 x503;
-  fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478);
   uint32_t x504;
   fiat_p256_uint1 x505;
-  fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476);
-  uint32_t x506 = (x505 + x477);
+  uint32_t x506;
   uint32_t x507;
   fiat_p256_uint1 x508;
-  fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490);
   uint32_t x509;
   fiat_p256_uint1 x510;
-  fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492);
   uint32_t x511;
   fiat_p256_uint1 x512;
-  fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494);
   uint32_t x513;
   fiat_p256_uint1 x514;
-  fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496);
   uint32_t x515;
   fiat_p256_uint1 x516;
-  fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498);
   uint32_t x517;
   fiat_p256_uint1 x518;
-  fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500);
   uint32_t x519;
   fiat_p256_uint1 x520;
-  fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502);
   uint32_t x521;
   fiat_p256_uint1 x522;
-  fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504);
   uint32_t x523;
   fiat_p256_uint1 x524;
-  fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506);
   uint32_t x525;
   uint32_t x526;
-  fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff));
   uint32_t x527;
   uint32_t x528;
-  fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff));
   uint32_t x529;
   uint32_t x530;
-  fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff));
   uint32_t x531;
   uint32_t x532;
-  fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff));
   uint32_t x533;
   fiat_p256_uint1 x534;
-  fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529);
   uint32_t x535;
   fiat_p256_uint1 x536;
-  fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527);
-  uint32_t x537 = (x536 + x528);
+  uint32_t x537;
   uint32_t x538;
   fiat_p256_uint1 x539;
-  fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531);
   uint32_t x540;
   fiat_p256_uint1 x541;
-  fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533);
   uint32_t x542;
   fiat_p256_uint1 x543;
-  fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535);
   uint32_t x544;
   fiat_p256_uint1 x545;
-  fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537);
   uint32_t x546;
   fiat_p256_uint1 x547;
-  fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0);
   uint32_t x548;
   fiat_p256_uint1 x549;
-  fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0);
   uint32_t x550;
   fiat_p256_uint1 x551;
-  fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507);
   uint32_t x552;
   fiat_p256_uint1 x553;
-  fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525);
   uint32_t x554;
   fiat_p256_uint1 x555;
-  fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526);
-  uint32_t x556 = ((uint32_t)x555 + x524);
+  uint32_t x556;
   uint32_t x557;
   uint32_t x558;
-  fiat_p256_mulx_u32(&x557, &x558, x7, (arg1[7]));
   uint32_t x559;
   uint32_t x560;
-  fiat_p256_mulx_u32(&x559, &x560, x7, (arg1[6]));
   uint32_t x561;
   uint32_t x562;
-  fiat_p256_mulx_u32(&x561, &x562, x7, (arg1[5]));
   uint32_t x563;
   uint32_t x564;
-  fiat_p256_mulx_u32(&x563, &x564, x7, (arg1[4]));
   uint32_t x565;
   uint32_t x566;
-  fiat_p256_mulx_u32(&x565, &x566, x7, (arg1[3]));
   uint32_t x567;
   uint32_t x568;
-  fiat_p256_mulx_u32(&x567, &x568, x7, (arg1[2]));
   uint32_t x569;
   uint32_t x570;
-  fiat_p256_mulx_u32(&x569, &x570, x7, (arg1[1]));
   uint32_t x571;
   uint32_t x572;
-  fiat_p256_mulx_u32(&x571, &x572, x7, (arg1[0]));
   uint32_t x573;
   fiat_p256_uint1 x574;
-  fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569);
   uint32_t x575;
   fiat_p256_uint1 x576;
-  fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567);
   uint32_t x577;
   fiat_p256_uint1 x578;
-  fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565);
   uint32_t x579;
   fiat_p256_uint1 x580;
-  fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563);
   uint32_t x581;
   fiat_p256_uint1 x582;
-  fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561);
   uint32_t x583;
   fiat_p256_uint1 x584;
-  fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559);
   uint32_t x585;
   fiat_p256_uint1 x586;
-  fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557);
-  uint32_t x587 = (x586 + x558);
+  uint32_t x587;
   uint32_t x588;
   fiat_p256_uint1 x589;
-  fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571);
   uint32_t x590;
   fiat_p256_uint1 x591;
-  fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573);
   uint32_t x592;
   fiat_p256_uint1 x593;
-  fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575);
   uint32_t x594;
   fiat_p256_uint1 x595;
-  fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577);
   uint32_t x596;
   fiat_p256_uint1 x597;
-  fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579);
   uint32_t x598;
   fiat_p256_uint1 x599;
-  fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581);
   uint32_t x600;
   fiat_p256_uint1 x601;
-  fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583);
   uint32_t x602;
   fiat_p256_uint1 x603;
-  fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585);
   uint32_t x604;
   fiat_p256_uint1 x605;
-  fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587);
   uint32_t x606;
   uint32_t x607;
-  fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff));
   uint32_t x608;
   uint32_t x609;
-  fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff));
   uint32_t x610;
   uint32_t x611;
-  fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff));
   uint32_t x612;
   uint32_t x613;
-  fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff));
   uint32_t x614;
   fiat_p256_uint1 x615;
-  fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610);
   uint32_t x616;
   fiat_p256_uint1 x617;
-  fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608);
-  uint32_t x618 = (x617 + x609);
+  uint32_t x618;
   uint32_t x619;
   fiat_p256_uint1 x620;
-  fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612);
   uint32_t x621;
   fiat_p256_uint1 x622;
-  fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614);
   uint32_t x623;
   fiat_p256_uint1 x624;
-  fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616);
   uint32_t x625;
   fiat_p256_uint1 x626;
-  fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618);
   uint32_t x627;
   fiat_p256_uint1 x628;
-  fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0);
   uint32_t x629;
   fiat_p256_uint1 x630;
-  fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0);
   uint32_t x631;
   fiat_p256_uint1 x632;
-  fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588);
   uint32_t x633;
   fiat_p256_uint1 x634;
-  fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606);
   uint32_t x635;
   fiat_p256_uint1 x636;
-  fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607);
-  uint32_t x637 = ((uint32_t)x636 + x605);
+  uint32_t x637;
   uint32_t x638;
   fiat_p256_uint1 x639;
-  fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff));
   uint32_t x640;
   fiat_p256_uint1 x641;
-  fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff));
   uint32_t x642;
   fiat_p256_uint1 x643;
-  fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff));
   uint32_t x644;
   fiat_p256_uint1 x645;
-  fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0);
   uint32_t x646;
   fiat_p256_uint1 x647;
-  fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0);
   uint32_t x648;
   fiat_p256_uint1 x649;
-  fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0);
   uint32_t x650;
   fiat_p256_uint1 x651;
-  fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1);
   uint32_t x652;
   fiat_p256_uint1 x653;
-  fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff));
   uint32_t x654;
   fiat_p256_uint1 x655;
-  fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0);
   uint32_t x656;
-  fiat_p256_cmovznz_u32(&x656, x655, x638, x621);
   uint32_t x657;
-  fiat_p256_cmovznz_u32(&x657, x655, x640, x623);
   uint32_t x658;
-  fiat_p256_cmovznz_u32(&x658, x655, x642, x625);
   uint32_t x659;
-  fiat_p256_cmovznz_u32(&x659, x655, x644, x627);
   uint32_t x660;
-  fiat_p256_cmovznz_u32(&x660, x655, x646, x629);
   uint32_t x661;
-  fiat_p256_cmovznz_u32(&x661, x655, x648, x631);
   uint32_t x662;
-  fiat_p256_cmovznz_u32(&x662, x655, x650, x633);
   uint32_t x663;
+  x1 = (arg1[1]);
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[4]);
+  x5 = (arg1[5]);
+  x6 = (arg1[6]);
+  x7 = (arg1[7]);
+  x8 = (arg1[0]);
+  fiat_p256_mulx_u32(&x9, &x10, x8, (arg1[7]));
+  fiat_p256_mulx_u32(&x11, &x12, x8, (arg1[6]));
+  fiat_p256_mulx_u32(&x13, &x14, x8, (arg1[5]));
+  fiat_p256_mulx_u32(&x15, &x16, x8, (arg1[4]));
+  fiat_p256_mulx_u32(&x17, &x18, x8, (arg1[3]));
+  fiat_p256_mulx_u32(&x19, &x20, x8, (arg1[2]));
+  fiat_p256_mulx_u32(&x21, &x22, x8, (arg1[1]));
+  fiat_p256_mulx_u32(&x23, &x24, x8, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x25, &x26, 0x0, x24, x21);
+  fiat_p256_addcarryx_u32(&x27, &x28, x26, x22, x19);
+  fiat_p256_addcarryx_u32(&x29, &x30, x28, x20, x17);
+  fiat_p256_addcarryx_u32(&x31, &x32, x30, x18, x15);
+  fiat_p256_addcarryx_u32(&x33, &x34, x32, x16, x13);
+  fiat_p256_addcarryx_u32(&x35, &x36, x34, x14, x11);
+  fiat_p256_addcarryx_u32(&x37, &x38, x36, x12, x9);
+  x39 = (x38 + x10);
+  fiat_p256_mulx_u32(&x40, &x41, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x42, &x43, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x44, &x45, x23, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x46, &x47, x23, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x47, x44);
+  fiat_p256_addcarryx_u32(&x50, &x51, x49, x45, x42);
+  x52 = (x51 + x43);
+  fiat_p256_addcarryx_u32(&x53, &x54, 0x0, x23, x46);
+  fiat_p256_addcarryx_u32(&x55, &x56, x54, x25, x48);
+  fiat_p256_addcarryx_u32(&x57, &x58, x56, x27, x50);
+  fiat_p256_addcarryx_u32(&x59, &x60, x58, x29, x52);
+  fiat_p256_addcarryx_u32(&x61, &x62, x60, x31, 0x0);
+  fiat_p256_addcarryx_u32(&x63, &x64, x62, x33, 0x0);
+  fiat_p256_addcarryx_u32(&x65, &x66, x64, x35, x23);
+  fiat_p256_addcarryx_u32(&x67, &x68, x66, x37, x40);
+  fiat_p256_addcarryx_u32(&x69, &x70, x68, x39, x41);
+  fiat_p256_mulx_u32(&x71, &x72, x1, (arg1[7]));
+  fiat_p256_mulx_u32(&x73, &x74, x1, (arg1[6]));
+  fiat_p256_mulx_u32(&x75, &x76, x1, (arg1[5]));
+  fiat_p256_mulx_u32(&x77, &x78, x1, (arg1[4]));
+  fiat_p256_mulx_u32(&x79, &x80, x1, (arg1[3]));
+  fiat_p256_mulx_u32(&x81, &x82, x1, (arg1[2]));
+  fiat_p256_mulx_u32(&x83, &x84, x1, (arg1[1]));
+  fiat_p256_mulx_u32(&x85, &x86, x1, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x86, x83);
+  fiat_p256_addcarryx_u32(&x89, &x90, x88, x84, x81);
+  fiat_p256_addcarryx_u32(&x91, &x92, x90, x82, x79);
+  fiat_p256_addcarryx_u32(&x93, &x94, x92, x80, x77);
+  fiat_p256_addcarryx_u32(&x95, &x96, x94, x78, x75);
+  fiat_p256_addcarryx_u32(&x97, &x98, x96, x76, x73);
+  fiat_p256_addcarryx_u32(&x99, &x100, x98, x74, x71);
+  x101 = (x100 + x72);
+  fiat_p256_addcarryx_u32(&x102, &x103, 0x0, x55, x85);
+  fiat_p256_addcarryx_u32(&x104, &x105, x103, x57, x87);
+  fiat_p256_addcarryx_u32(&x106, &x107, x105, x59, x89);
+  fiat_p256_addcarryx_u32(&x108, &x109, x107, x61, x91);
+  fiat_p256_addcarryx_u32(&x110, &x111, x109, x63, x93);
+  fiat_p256_addcarryx_u32(&x112, &x113, x111, x65, x95);
+  fiat_p256_addcarryx_u32(&x114, &x115, x113, x67, x97);
+  fiat_p256_addcarryx_u32(&x116, &x117, x115, x69, x99);
+  fiat_p256_addcarryx_u32(&x118, &x119, x117, x70, x101);
+  fiat_p256_mulx_u32(&x120, &x121, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x122, &x123, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x124, &x125, x102, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x126, &x127, x102, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x128, &x129, 0x0, x127, x124);
+  fiat_p256_addcarryx_u32(&x130, &x131, x129, x125, x122);
+  x132 = (x131 + x123);
+  fiat_p256_addcarryx_u32(&x133, &x134, 0x0, x102, x126);
+  fiat_p256_addcarryx_u32(&x135, &x136, x134, x104, x128);
+  fiat_p256_addcarryx_u32(&x137, &x138, x136, x106, x130);
+  fiat_p256_addcarryx_u32(&x139, &x140, x138, x108, x132);
+  fiat_p256_addcarryx_u32(&x141, &x142, x140, x110, 0x0);
+  fiat_p256_addcarryx_u32(&x143, &x144, x142, x112, 0x0);
+  fiat_p256_addcarryx_u32(&x145, &x146, x144, x114, x102);
+  fiat_p256_addcarryx_u32(&x147, &x148, x146, x116, x120);
+  fiat_p256_addcarryx_u32(&x149, &x150, x148, x118, x121);
+  x151 = ((uint32_t)x150 + x119);
+  fiat_p256_mulx_u32(&x152, &x153, x2, (arg1[7]));
+  fiat_p256_mulx_u32(&x154, &x155, x2, (arg1[6]));
+  fiat_p256_mulx_u32(&x156, &x157, x2, (arg1[5]));
+  fiat_p256_mulx_u32(&x158, &x159, x2, (arg1[4]));
+  fiat_p256_mulx_u32(&x160, &x161, x2, (arg1[3]));
+  fiat_p256_mulx_u32(&x162, &x163, x2, (arg1[2]));
+  fiat_p256_mulx_u32(&x164, &x165, x2, (arg1[1]));
+  fiat_p256_mulx_u32(&x166, &x167, x2, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x168, &x169, 0x0, x167, x164);
+  fiat_p256_addcarryx_u32(&x170, &x171, x169, x165, x162);
+  fiat_p256_addcarryx_u32(&x172, &x173, x171, x163, x160);
+  fiat_p256_addcarryx_u32(&x174, &x175, x173, x161, x158);
+  fiat_p256_addcarryx_u32(&x176, &x177, x175, x159, x156);
+  fiat_p256_addcarryx_u32(&x178, &x179, x177, x157, x154);
+  fiat_p256_addcarryx_u32(&x180, &x181, x179, x155, x152);
+  x182 = (x181 + x153);
+  fiat_p256_addcarryx_u32(&x183, &x184, 0x0, x135, x166);
+  fiat_p256_addcarryx_u32(&x185, &x186, x184, x137, x168);
+  fiat_p256_addcarryx_u32(&x187, &x188, x186, x139, x170);
+  fiat_p256_addcarryx_u32(&x189, &x190, x188, x141, x172);
+  fiat_p256_addcarryx_u32(&x191, &x192, x190, x143, x174);
+  fiat_p256_addcarryx_u32(&x193, &x194, x192, x145, x176);
+  fiat_p256_addcarryx_u32(&x195, &x196, x194, x147, x178);
+  fiat_p256_addcarryx_u32(&x197, &x198, x196, x149, x180);
+  fiat_p256_addcarryx_u32(&x199, &x200, x198, x151, x182);
+  fiat_p256_mulx_u32(&x201, &x202, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x203, &x204, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x205, &x206, x183, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x207, &x208, x183, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x209, &x210, 0x0, x208, x205);
+  fiat_p256_addcarryx_u32(&x211, &x212, x210, x206, x203);
+  x213 = (x212 + x204);
+  fiat_p256_addcarryx_u32(&x214, &x215, 0x0, x183, x207);
+  fiat_p256_addcarryx_u32(&x216, &x217, x215, x185, x209);
+  fiat_p256_addcarryx_u32(&x218, &x219, x217, x187, x211);
+  fiat_p256_addcarryx_u32(&x220, &x221, x219, x189, x213);
+  fiat_p256_addcarryx_u32(&x222, &x223, x221, x191, 0x0);
+  fiat_p256_addcarryx_u32(&x224, &x225, x223, x193, 0x0);
+  fiat_p256_addcarryx_u32(&x226, &x227, x225, x195, x183);
+  fiat_p256_addcarryx_u32(&x228, &x229, x227, x197, x201);
+  fiat_p256_addcarryx_u32(&x230, &x231, x229, x199, x202);
+  x232 = ((uint32_t)x231 + x200);
+  fiat_p256_mulx_u32(&x233, &x234, x3, (arg1[7]));
+  fiat_p256_mulx_u32(&x235, &x236, x3, (arg1[6]));
+  fiat_p256_mulx_u32(&x237, &x238, x3, (arg1[5]));
+  fiat_p256_mulx_u32(&x239, &x240, x3, (arg1[4]));
+  fiat_p256_mulx_u32(&x241, &x242, x3, (arg1[3]));
+  fiat_p256_mulx_u32(&x243, &x244, x3, (arg1[2]));
+  fiat_p256_mulx_u32(&x245, &x246, x3, (arg1[1]));
+  fiat_p256_mulx_u32(&x247, &x248, x3, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x249, &x250, 0x0, x248, x245);
+  fiat_p256_addcarryx_u32(&x251, &x252, x250, x246, x243);
+  fiat_p256_addcarryx_u32(&x253, &x254, x252, x244, x241);
+  fiat_p256_addcarryx_u32(&x255, &x256, x254, x242, x239);
+  fiat_p256_addcarryx_u32(&x257, &x258, x256, x240, x237);
+  fiat_p256_addcarryx_u32(&x259, &x260, x258, x238, x235);
+  fiat_p256_addcarryx_u32(&x261, &x262, x260, x236, x233);
+  x263 = (x262 + x234);
+  fiat_p256_addcarryx_u32(&x264, &x265, 0x0, x216, x247);
+  fiat_p256_addcarryx_u32(&x266, &x267, x265, x218, x249);
+  fiat_p256_addcarryx_u32(&x268, &x269, x267, x220, x251);
+  fiat_p256_addcarryx_u32(&x270, &x271, x269, x222, x253);
+  fiat_p256_addcarryx_u32(&x272, &x273, x271, x224, x255);
+  fiat_p256_addcarryx_u32(&x274, &x275, x273, x226, x257);
+  fiat_p256_addcarryx_u32(&x276, &x277, x275, x228, x259);
+  fiat_p256_addcarryx_u32(&x278, &x279, x277, x230, x261);
+  fiat_p256_addcarryx_u32(&x280, &x281, x279, x232, x263);
+  fiat_p256_mulx_u32(&x282, &x283, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x284, &x285, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x286, &x287, x264, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x288, &x289, x264, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x290, &x291, 0x0, x289, x286);
+  fiat_p256_addcarryx_u32(&x292, &x293, x291, x287, x284);
+  x294 = (x293 + x285);
+  fiat_p256_addcarryx_u32(&x295, &x296, 0x0, x264, x288);
+  fiat_p256_addcarryx_u32(&x297, &x298, x296, x266, x290);
+  fiat_p256_addcarryx_u32(&x299, &x300, x298, x268, x292);
+  fiat_p256_addcarryx_u32(&x301, &x302, x300, x270, x294);
+  fiat_p256_addcarryx_u32(&x303, &x304, x302, x272, 0x0);
+  fiat_p256_addcarryx_u32(&x305, &x306, x304, x274, 0x0);
+  fiat_p256_addcarryx_u32(&x307, &x308, x306, x276, x264);
+  fiat_p256_addcarryx_u32(&x309, &x310, x308, x278, x282);
+  fiat_p256_addcarryx_u32(&x311, &x312, x310, x280, x283);
+  x313 = ((uint32_t)x312 + x281);
+  fiat_p256_mulx_u32(&x314, &x315, x4, (arg1[7]));
+  fiat_p256_mulx_u32(&x316, &x317, x4, (arg1[6]));
+  fiat_p256_mulx_u32(&x318, &x319, x4, (arg1[5]));
+  fiat_p256_mulx_u32(&x320, &x321, x4, (arg1[4]));
+  fiat_p256_mulx_u32(&x322, &x323, x4, (arg1[3]));
+  fiat_p256_mulx_u32(&x324, &x325, x4, (arg1[2]));
+  fiat_p256_mulx_u32(&x326, &x327, x4, (arg1[1]));
+  fiat_p256_mulx_u32(&x328, &x329, x4, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x330, &x331, 0x0, x329, x326);
+  fiat_p256_addcarryx_u32(&x332, &x333, x331, x327, x324);
+  fiat_p256_addcarryx_u32(&x334, &x335, x333, x325, x322);
+  fiat_p256_addcarryx_u32(&x336, &x337, x335, x323, x320);
+  fiat_p256_addcarryx_u32(&x338, &x339, x337, x321, x318);
+  fiat_p256_addcarryx_u32(&x340, &x341, x339, x319, x316);
+  fiat_p256_addcarryx_u32(&x342, &x343, x341, x317, x314);
+  x344 = (x343 + x315);
+  fiat_p256_addcarryx_u32(&x345, &x346, 0x0, x297, x328);
+  fiat_p256_addcarryx_u32(&x347, &x348, x346, x299, x330);
+  fiat_p256_addcarryx_u32(&x349, &x350, x348, x301, x332);
+  fiat_p256_addcarryx_u32(&x351, &x352, x350, x303, x334);
+  fiat_p256_addcarryx_u32(&x353, &x354, x352, x305, x336);
+  fiat_p256_addcarryx_u32(&x355, &x356, x354, x307, x338);
+  fiat_p256_addcarryx_u32(&x357, &x358, x356, x309, x340);
+  fiat_p256_addcarryx_u32(&x359, &x360, x358, x311, x342);
+  fiat_p256_addcarryx_u32(&x361, &x362, x360, x313, x344);
+  fiat_p256_mulx_u32(&x363, &x364, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x365, &x366, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x367, &x368, x345, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x369, &x370, x345, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x371, &x372, 0x0, x370, x367);
+  fiat_p256_addcarryx_u32(&x373, &x374, x372, x368, x365);
+  x375 = (x374 + x366);
+  fiat_p256_addcarryx_u32(&x376, &x377, 0x0, x345, x369);
+  fiat_p256_addcarryx_u32(&x378, &x379, x377, x347, x371);
+  fiat_p256_addcarryx_u32(&x380, &x381, x379, x349, x373);
+  fiat_p256_addcarryx_u32(&x382, &x383, x381, x351, x375);
+  fiat_p256_addcarryx_u32(&x384, &x385, x383, x353, 0x0);
+  fiat_p256_addcarryx_u32(&x386, &x387, x385, x355, 0x0);
+  fiat_p256_addcarryx_u32(&x388, &x389, x387, x357, x345);
+  fiat_p256_addcarryx_u32(&x390, &x391, x389, x359, x363);
+  fiat_p256_addcarryx_u32(&x392, &x393, x391, x361, x364);
+  x394 = ((uint32_t)x393 + x362);
+  fiat_p256_mulx_u32(&x395, &x396, x5, (arg1[7]));
+  fiat_p256_mulx_u32(&x397, &x398, x5, (arg1[6]));
+  fiat_p256_mulx_u32(&x399, &x400, x5, (arg1[5]));
+  fiat_p256_mulx_u32(&x401, &x402, x5, (arg1[4]));
+  fiat_p256_mulx_u32(&x403, &x404, x5, (arg1[3]));
+  fiat_p256_mulx_u32(&x405, &x406, x5, (arg1[2]));
+  fiat_p256_mulx_u32(&x407, &x408, x5, (arg1[1]));
+  fiat_p256_mulx_u32(&x409, &x410, x5, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x411, &x412, 0x0, x410, x407);
+  fiat_p256_addcarryx_u32(&x413, &x414, x412, x408, x405);
+  fiat_p256_addcarryx_u32(&x415, &x416, x414, x406, x403);
+  fiat_p256_addcarryx_u32(&x417, &x418, x416, x404, x401);
+  fiat_p256_addcarryx_u32(&x419, &x420, x418, x402, x399);
+  fiat_p256_addcarryx_u32(&x421, &x422, x420, x400, x397);
+  fiat_p256_addcarryx_u32(&x423, &x424, x422, x398, x395);
+  x425 = (x424 + x396);
+  fiat_p256_addcarryx_u32(&x426, &x427, 0x0, x378, x409);
+  fiat_p256_addcarryx_u32(&x428, &x429, x427, x380, x411);
+  fiat_p256_addcarryx_u32(&x430, &x431, x429, x382, x413);
+  fiat_p256_addcarryx_u32(&x432, &x433, x431, x384, x415);
+  fiat_p256_addcarryx_u32(&x434, &x435, x433, x386, x417);
+  fiat_p256_addcarryx_u32(&x436, &x437, x435, x388, x419);
+  fiat_p256_addcarryx_u32(&x438, &x439, x437, x390, x421);
+  fiat_p256_addcarryx_u32(&x440, &x441, x439, x392, x423);
+  fiat_p256_addcarryx_u32(&x442, &x443, x441, x394, x425);
+  fiat_p256_mulx_u32(&x444, &x445, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x446, &x447, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x448, &x449, x426, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x450, &x451, x426, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x452, &x453, 0x0, x451, x448);
+  fiat_p256_addcarryx_u32(&x454, &x455, x453, x449, x446);
+  x456 = (x455 + x447);
+  fiat_p256_addcarryx_u32(&x457, &x458, 0x0, x426, x450);
+  fiat_p256_addcarryx_u32(&x459, &x460, x458, x428, x452);
+  fiat_p256_addcarryx_u32(&x461, &x462, x460, x430, x454);
+  fiat_p256_addcarryx_u32(&x463, &x464, x462, x432, x456);
+  fiat_p256_addcarryx_u32(&x465, &x466, x464, x434, 0x0);
+  fiat_p256_addcarryx_u32(&x467, &x468, x466, x436, 0x0);
+  fiat_p256_addcarryx_u32(&x469, &x470, x468, x438, x426);
+  fiat_p256_addcarryx_u32(&x471, &x472, x470, x440, x444);
+  fiat_p256_addcarryx_u32(&x473, &x474, x472, x442, x445);
+  x475 = ((uint32_t)x474 + x443);
+  fiat_p256_mulx_u32(&x476, &x477, x6, (arg1[7]));
+  fiat_p256_mulx_u32(&x478, &x479, x6, (arg1[6]));
+  fiat_p256_mulx_u32(&x480, &x481, x6, (arg1[5]));
+  fiat_p256_mulx_u32(&x482, &x483, x6, (arg1[4]));
+  fiat_p256_mulx_u32(&x484, &x485, x6, (arg1[3]));
+  fiat_p256_mulx_u32(&x486, &x487, x6, (arg1[2]));
+  fiat_p256_mulx_u32(&x488, &x489, x6, (arg1[1]));
+  fiat_p256_mulx_u32(&x490, &x491, x6, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x492, &x493, 0x0, x491, x488);
+  fiat_p256_addcarryx_u32(&x494, &x495, x493, x489, x486);
+  fiat_p256_addcarryx_u32(&x496, &x497, x495, x487, x484);
+  fiat_p256_addcarryx_u32(&x498, &x499, x497, x485, x482);
+  fiat_p256_addcarryx_u32(&x500, &x501, x499, x483, x480);
+  fiat_p256_addcarryx_u32(&x502, &x503, x501, x481, x478);
+  fiat_p256_addcarryx_u32(&x504, &x505, x503, x479, x476);
+  x506 = (x505 + x477);
+  fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x459, x490);
+  fiat_p256_addcarryx_u32(&x509, &x510, x508, x461, x492);
+  fiat_p256_addcarryx_u32(&x511, &x512, x510, x463, x494);
+  fiat_p256_addcarryx_u32(&x513, &x514, x512, x465, x496);
+  fiat_p256_addcarryx_u32(&x515, &x516, x514, x467, x498);
+  fiat_p256_addcarryx_u32(&x517, &x518, x516, x469, x500);
+  fiat_p256_addcarryx_u32(&x519, &x520, x518, x471, x502);
+  fiat_p256_addcarryx_u32(&x521, &x522, x520, x473, x504);
+  fiat_p256_addcarryx_u32(&x523, &x524, x522, x475, x506);
+  fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x531, &x532, x507, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x533, &x534, 0x0, x532, x529);
+  fiat_p256_addcarryx_u32(&x535, &x536, x534, x530, x527);
+  x537 = (x536 + x528);
+  fiat_p256_addcarryx_u32(&x538, &x539, 0x0, x507, x531);
+  fiat_p256_addcarryx_u32(&x540, &x541, x539, x509, x533);
+  fiat_p256_addcarryx_u32(&x542, &x543, x541, x511, x535);
+  fiat_p256_addcarryx_u32(&x544, &x545, x543, x513, x537);
+  fiat_p256_addcarryx_u32(&x546, &x547, x545, x515, 0x0);
+  fiat_p256_addcarryx_u32(&x548, &x549, x547, x517, 0x0);
+  fiat_p256_addcarryx_u32(&x550, &x551, x549, x519, x507);
+  fiat_p256_addcarryx_u32(&x552, &x553, x551, x521, x525);
+  fiat_p256_addcarryx_u32(&x554, &x555, x553, x523, x526);
+  x556 = ((uint32_t)x555 + x524);
+  fiat_p256_mulx_u32(&x557, &x558, x7, (arg1[7]));
+  fiat_p256_mulx_u32(&x559, &x560, x7, (arg1[6]));
+  fiat_p256_mulx_u32(&x561, &x562, x7, (arg1[5]));
+  fiat_p256_mulx_u32(&x563, &x564, x7, (arg1[4]));
+  fiat_p256_mulx_u32(&x565, &x566, x7, (arg1[3]));
+  fiat_p256_mulx_u32(&x567, &x568, x7, (arg1[2]));
+  fiat_p256_mulx_u32(&x569, &x570, x7, (arg1[1]));
+  fiat_p256_mulx_u32(&x571, &x572, x7, (arg1[0]));
+  fiat_p256_addcarryx_u32(&x573, &x574, 0x0, x572, x569);
+  fiat_p256_addcarryx_u32(&x575, &x576, x574, x570, x567);
+  fiat_p256_addcarryx_u32(&x577, &x578, x576, x568, x565);
+  fiat_p256_addcarryx_u32(&x579, &x580, x578, x566, x563);
+  fiat_p256_addcarryx_u32(&x581, &x582, x580, x564, x561);
+  fiat_p256_addcarryx_u32(&x583, &x584, x582, x562, x559);
+  fiat_p256_addcarryx_u32(&x585, &x586, x584, x560, x557);
+  x587 = (x586 + x558);
+  fiat_p256_addcarryx_u32(&x588, &x589, 0x0, x540, x571);
+  fiat_p256_addcarryx_u32(&x590, &x591, x589, x542, x573);
+  fiat_p256_addcarryx_u32(&x592, &x593, x591, x544, x575);
+  fiat_p256_addcarryx_u32(&x594, &x595, x593, x546, x577);
+  fiat_p256_addcarryx_u32(&x596, &x597, x595, x548, x579);
+  fiat_p256_addcarryx_u32(&x598, &x599, x597, x550, x581);
+  fiat_p256_addcarryx_u32(&x600, &x601, x599, x552, x583);
+  fiat_p256_addcarryx_u32(&x602, &x603, x601, x554, x585);
+  fiat_p256_addcarryx_u32(&x604, &x605, x603, x556, x587);
+  fiat_p256_mulx_u32(&x606, &x607, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x608, &x609, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x610, &x611, x588, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x612, &x613, x588, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x614, &x615, 0x0, x613, x610);
+  fiat_p256_addcarryx_u32(&x616, &x617, x615, x611, x608);
+  x618 = (x617 + x609);
+  fiat_p256_addcarryx_u32(&x619, &x620, 0x0, x588, x612);
+  fiat_p256_addcarryx_u32(&x621, &x622, x620, x590, x614);
+  fiat_p256_addcarryx_u32(&x623, &x624, x622, x592, x616);
+  fiat_p256_addcarryx_u32(&x625, &x626, x624, x594, x618);
+  fiat_p256_addcarryx_u32(&x627, &x628, x626, x596, 0x0);
+  fiat_p256_addcarryx_u32(&x629, &x630, x628, x598, 0x0);
+  fiat_p256_addcarryx_u32(&x631, &x632, x630, x600, x588);
+  fiat_p256_addcarryx_u32(&x633, &x634, x632, x602, x606);
+  fiat_p256_addcarryx_u32(&x635, &x636, x634, x604, x607);
+  x637 = ((uint32_t)x636 + x605);
+  fiat_p256_subborrowx_u32(&x638, &x639, 0x0, x621, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x640, &x641, x639, x623, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x642, &x643, x641, x625, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x644, &x645, x643, x627, 0x0);
+  fiat_p256_subborrowx_u32(&x646, &x647, x645, x629, 0x0);
+  fiat_p256_subborrowx_u32(&x648, &x649, x647, x631, 0x0);
+  fiat_p256_subborrowx_u32(&x650, &x651, x649, x633, 0x1);
+  fiat_p256_subborrowx_u32(&x652, &x653, x651, x635, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x654, &x655, x653, x637, 0x0);
+  fiat_p256_cmovznz_u32(&x656, x655, x638, x621);
+  fiat_p256_cmovznz_u32(&x657, x655, x640, x623);
+  fiat_p256_cmovznz_u32(&x658, x655, x642, x625);
+  fiat_p256_cmovznz_u32(&x659, x655, x644, x627);
+  fiat_p256_cmovznz_u32(&x660, x655, x646, x629);
+  fiat_p256_cmovznz_u32(&x661, x655, x648, x631);
+  fiat_p256_cmovznz_u32(&x662, x655, x650, x633);
   fiat_p256_cmovznz_u32(&x663, x655, x652, x635);
   out1[0] = x656;
   out1[1] = x657;
@@ -2128,6 +2221,7 @@
 
 /*
  * The function fiat_p256_add adds two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -2135,79 +2229,74 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- *   arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_add(uint32_t out1[8], const uint32_t arg1[8], const uint32_t arg2[8]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
   uint32_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
   uint32_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_addcarryx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1]));
   uint32_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_addcarryx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2]));
   uint32_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_addcarryx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3]));
   uint32_t x9;
   fiat_p256_uint1 x10;
-  fiat_p256_addcarryx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4]));
   uint32_t x11;
   fiat_p256_uint1 x12;
-  fiat_p256_addcarryx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5]));
   uint32_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_addcarryx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6]));
   uint32_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_addcarryx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7]));
   uint32_t x17;
   fiat_p256_uint1 x18;
-  fiat_p256_subborrowx_u32(&x17, &x18, 0x0, x1, UINT32_C(0xffffffff));
   uint32_t x19;
   fiat_p256_uint1 x20;
-  fiat_p256_subborrowx_u32(&x19, &x20, x18, x3, UINT32_C(0xffffffff));
   uint32_t x21;
   fiat_p256_uint1 x22;
-  fiat_p256_subborrowx_u32(&x21, &x22, x20, x5, UINT32_C(0xffffffff));
   uint32_t x23;
   fiat_p256_uint1 x24;
-  fiat_p256_subborrowx_u32(&x23, &x24, x22, x7, 0x0);
   uint32_t x25;
   fiat_p256_uint1 x26;
-  fiat_p256_subborrowx_u32(&x25, &x26, x24, x9, 0x0);
   uint32_t x27;
   fiat_p256_uint1 x28;
-  fiat_p256_subborrowx_u32(&x27, &x28, x26, x11, 0x0);
   uint32_t x29;
   fiat_p256_uint1 x30;
-  fiat_p256_subborrowx_u32(&x29, &x30, x28, x13, 0x1);
   uint32_t x31;
   fiat_p256_uint1 x32;
-  fiat_p256_subborrowx_u32(&x31, &x32, x30, x15, UINT32_C(0xffffffff));
   uint32_t x33;
   fiat_p256_uint1 x34;
-  fiat_p256_subborrowx_u32(&x33, &x34, x32, x16, 0x0);
   uint32_t x35;
-  fiat_p256_cmovznz_u32(&x35, x34, x17, x1);
   uint32_t x36;
-  fiat_p256_cmovznz_u32(&x36, x34, x19, x3);
   uint32_t x37;
-  fiat_p256_cmovznz_u32(&x37, x34, x21, x5);
   uint32_t x38;
-  fiat_p256_cmovznz_u32(&x38, x34, x23, x7);
   uint32_t x39;
-  fiat_p256_cmovznz_u32(&x39, x34, x25, x9);
   uint32_t x40;
-  fiat_p256_cmovznz_u32(&x40, x34, x27, x11);
   uint32_t x41;
-  fiat_p256_cmovznz_u32(&x41, x34, x29, x13);
   uint32_t x42;
+  fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
+  fiat_p256_addcarryx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1]));
+  fiat_p256_addcarryx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2]));
+  fiat_p256_addcarryx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3]));
+  fiat_p256_addcarryx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4]));
+  fiat_p256_addcarryx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5]));
+  fiat_p256_addcarryx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6]));
+  fiat_p256_addcarryx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7]));
+  fiat_p256_subborrowx_u32(&x17, &x18, 0x0, x1, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x19, &x20, x18, x3, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x21, &x22, x20, x5, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x23, &x24, x22, x7, 0x0);
+  fiat_p256_subborrowx_u32(&x25, &x26, x24, x9, 0x0);
+  fiat_p256_subborrowx_u32(&x27, &x28, x26, x11, 0x0);
+  fiat_p256_subborrowx_u32(&x29, &x30, x28, x13, 0x1);
+  fiat_p256_subborrowx_u32(&x31, &x32, x30, x15, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x33, &x34, x32, x16, 0x0);
+  fiat_p256_cmovznz_u32(&x35, x34, x17, x1);
+  fiat_p256_cmovznz_u32(&x36, x34, x19, x3);
+  fiat_p256_cmovznz_u32(&x37, x34, x21, x5);
+  fiat_p256_cmovznz_u32(&x38, x34, x23, x7);
+  fiat_p256_cmovznz_u32(&x39, x34, x25, x9);
+  fiat_p256_cmovznz_u32(&x40, x34, x27, x11);
+  fiat_p256_cmovznz_u32(&x41, x34, x29, x13);
   fiat_p256_cmovznz_u32(&x42, x34, x31, x15);
   out1[0] = x35;
   out1[1] = x36;
@@ -2221,6 +2310,7 @@
 
 /*
  * The function fiat_p256_sub subtracts two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -2228,63 +2318,58 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- *   arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_sub(uint32_t out1[8], const uint32_t arg1[8], const uint32_t arg2[8]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
   uint32_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_subborrowx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
   uint32_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_subborrowx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1]));
   uint32_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_subborrowx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2]));
   uint32_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_subborrowx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3]));
   uint32_t x9;
   fiat_p256_uint1 x10;
-  fiat_p256_subborrowx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4]));
   uint32_t x11;
   fiat_p256_uint1 x12;
-  fiat_p256_subborrowx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5]));
   uint32_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_subborrowx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6]));
   uint32_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_subborrowx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7]));
   uint32_t x17;
-  fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff));
   uint32_t x18;
   fiat_p256_uint1 x19;
-  fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, (x17 & UINT32_C(0xffffffff)));
   uint32_t x20;
   fiat_p256_uint1 x21;
-  fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, (x17 & UINT32_C(0xffffffff)));
   uint32_t x22;
   fiat_p256_uint1 x23;
-  fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, (x17 & UINT32_C(0xffffffff)));
   uint32_t x24;
   fiat_p256_uint1 x25;
-  fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0);
   uint32_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0);
   uint32_t x28;
   fiat_p256_uint1 x29;
-  fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0);
   uint32_t x30;
   fiat_p256_uint1 x31;
-  fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1));
   uint32_t x32;
   fiat_p256_uint1 x33;
-  fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, (x17 & UINT32_C(0xffffffff)));
+  fiat_p256_subborrowx_u32(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
+  fiat_p256_subborrowx_u32(&x3, &x4, x2, (arg1[1]), (arg2[1]));
+  fiat_p256_subborrowx_u32(&x5, &x6, x4, (arg1[2]), (arg2[2]));
+  fiat_p256_subborrowx_u32(&x7, &x8, x6, (arg1[3]), (arg2[3]));
+  fiat_p256_subborrowx_u32(&x9, &x10, x8, (arg1[4]), (arg2[4]));
+  fiat_p256_subborrowx_u32(&x11, &x12, x10, (arg1[5]), (arg2[5]));
+  fiat_p256_subborrowx_u32(&x13, &x14, x12, (arg1[6]), (arg2[6]));
+  fiat_p256_subborrowx_u32(&x15, &x16, x14, (arg1[7]), (arg2[7]));
+  fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17);
+  fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17);
+  fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17);
+  fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0);
+  fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0);
+  fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0);
+  fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1));
+  fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17);
   out1[0] = x18;
   out1[1] = x20;
   out1[2] = x22;
@@ -2297,68 +2382,65 @@
 
 /*
  * The function fiat_p256_opp negates a field element in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_opp(uint32_t out1[8], const uint32_t arg1[8]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
   uint32_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_subborrowx_u32(&x1, &x2, 0x0, 0x0, (arg1[0]));
   uint32_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_subborrowx_u32(&x3, &x4, x2, 0x0, (arg1[1]));
   uint32_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_subborrowx_u32(&x5, &x6, x4, 0x0, (arg1[2]));
   uint32_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_subborrowx_u32(&x7, &x8, x6, 0x0, (arg1[3]));
   uint32_t x9;
   fiat_p256_uint1 x10;
-  fiat_p256_subborrowx_u32(&x9, &x10, x8, 0x0, (arg1[4]));
   uint32_t x11;
   fiat_p256_uint1 x12;
-  fiat_p256_subborrowx_u32(&x11, &x12, x10, 0x0, (arg1[5]));
   uint32_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_subborrowx_u32(&x13, &x14, x12, 0x0, (arg1[6]));
   uint32_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_subborrowx_u32(&x15, &x16, x14, 0x0, (arg1[7]));
   uint32_t x17;
-  fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff));
   uint32_t x18;
   fiat_p256_uint1 x19;
-  fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, (x17 & UINT32_C(0xffffffff)));
   uint32_t x20;
   fiat_p256_uint1 x21;
-  fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, (x17 & UINT32_C(0xffffffff)));
   uint32_t x22;
   fiat_p256_uint1 x23;
-  fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, (x17 & UINT32_C(0xffffffff)));
   uint32_t x24;
   fiat_p256_uint1 x25;
-  fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0);
   uint32_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0);
   uint32_t x28;
   fiat_p256_uint1 x29;
-  fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0);
   uint32_t x30;
   fiat_p256_uint1 x31;
-  fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1));
   uint32_t x32;
   fiat_p256_uint1 x33;
-  fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, (x17 & UINT32_C(0xffffffff)));
+  fiat_p256_subborrowx_u32(&x1, &x2, 0x0, 0x0, (arg1[0]));
+  fiat_p256_subborrowx_u32(&x3, &x4, x2, 0x0, (arg1[1]));
+  fiat_p256_subborrowx_u32(&x5, &x6, x4, 0x0, (arg1[2]));
+  fiat_p256_subborrowx_u32(&x7, &x8, x6, 0x0, (arg1[3]));
+  fiat_p256_subborrowx_u32(&x9, &x10, x8, 0x0, (arg1[4]));
+  fiat_p256_subborrowx_u32(&x11, &x12, x10, 0x0, (arg1[5]));
+  fiat_p256_subborrowx_u32(&x13, &x14, x12, 0x0, (arg1[6]));
+  fiat_p256_subborrowx_u32(&x15, &x16, x14, 0x0, (arg1[7]));
+  fiat_p256_cmovznz_u32(&x17, x16, 0x0, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x18, &x19, 0x0, x1, x17);
+  fiat_p256_addcarryx_u32(&x20, &x21, x19, x3, x17);
+  fiat_p256_addcarryx_u32(&x22, &x23, x21, x5, x17);
+  fiat_p256_addcarryx_u32(&x24, &x25, x23, x7, 0x0);
+  fiat_p256_addcarryx_u32(&x26, &x27, x25, x9, 0x0);
+  fiat_p256_addcarryx_u32(&x28, &x29, x27, x11, 0x0);
+  fiat_p256_addcarryx_u32(&x30, &x31, x29, x13, (fiat_p256_uint1)(x17 & 0x1));
+  fiat_p256_addcarryx_u32(&x32, &x33, x31, x15, x17);
   out1[0] = x18;
   out1[1] = x20;
   out1[2] = x22;
@@ -2371,532 +2453,530 @@
 
 /*
  * The function fiat_p256_from_montgomery translates a field element out of the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * ((2^32)⁻¹ mod m)^8) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_from_montgomery(uint32_t out1[8], const uint32_t arg1[8]) {
-  uint32_t x1 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_from_montgomery(fiat_p256_non_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
+  uint32_t x1;
   uint32_t x2;
   uint32_t x3;
-  fiat_p256_mulx_u32(&x2, &x3, x1, UINT32_C(0xffffffff));
   uint32_t x4;
   uint32_t x5;
-  fiat_p256_mulx_u32(&x4, &x5, x1, UINT32_C(0xffffffff));
   uint32_t x6;
   uint32_t x7;
-  fiat_p256_mulx_u32(&x6, &x7, x1, UINT32_C(0xffffffff));
   uint32_t x8;
   uint32_t x9;
-  fiat_p256_mulx_u32(&x8, &x9, x1, UINT32_C(0xffffffff));
   uint32_t x10;
   fiat_p256_uint1 x11;
-  fiat_p256_addcarryx_u32(&x10, &x11, 0x0, x9, x6);
   uint32_t x12;
   fiat_p256_uint1 x13;
-  fiat_p256_addcarryx_u32(&x12, &x13, x11, x7, x4);
   uint32_t x14;
   fiat_p256_uint1 x15;
-  fiat_p256_addcarryx_u32(&x14, &x15, 0x0, x1, x8);
   uint32_t x16;
   fiat_p256_uint1 x17;
-  fiat_p256_addcarryx_u32(&x16, &x17, x15, 0x0, x10);
   uint32_t x18;
   fiat_p256_uint1 x19;
-  fiat_p256_addcarryx_u32(&x18, &x19, x17, 0x0, x12);
   uint32_t x20;
   fiat_p256_uint1 x21;
-  fiat_p256_addcarryx_u32(&x20, &x21, x19, 0x0, (x13 + x5));
   uint32_t x22;
   fiat_p256_uint1 x23;
-  fiat_p256_addcarryx_u32(&x22, &x23, 0x0, x16, (arg1[1]));
   uint32_t x24;
   fiat_p256_uint1 x25;
-  fiat_p256_addcarryx_u32(&x24, &x25, x23, x18, 0x0);
   uint32_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u32(&x26, &x27, x25, x20, 0x0);
   uint32_t x28;
   uint32_t x29;
-  fiat_p256_mulx_u32(&x28, &x29, x22, UINT32_C(0xffffffff));
   uint32_t x30;
   uint32_t x31;
-  fiat_p256_mulx_u32(&x30, &x31, x22, UINT32_C(0xffffffff));
   uint32_t x32;
   uint32_t x33;
-  fiat_p256_mulx_u32(&x32, &x33, x22, UINT32_C(0xffffffff));
   uint32_t x34;
   uint32_t x35;
-  fiat_p256_mulx_u32(&x34, &x35, x22, UINT32_C(0xffffffff));
   uint32_t x36;
   fiat_p256_uint1 x37;
-  fiat_p256_addcarryx_u32(&x36, &x37, 0x0, x35, x32);
   uint32_t x38;
   fiat_p256_uint1 x39;
-  fiat_p256_addcarryx_u32(&x38, &x39, x37, x33, x30);
   uint32_t x40;
   fiat_p256_uint1 x41;
-  fiat_p256_addcarryx_u32(&x40, &x41, 0x0, x22, x34);
   uint32_t x42;
   fiat_p256_uint1 x43;
-  fiat_p256_addcarryx_u32(&x42, &x43, x41, x24, x36);
   uint32_t x44;
   fiat_p256_uint1 x45;
-  fiat_p256_addcarryx_u32(&x44, &x45, x43, x26, x38);
   uint32_t x46;
   fiat_p256_uint1 x47;
-  fiat_p256_addcarryx_u32(&x46, &x47, x45, ((uint32_t)x27 + x21), (x39 + x31));
   uint32_t x48;
   fiat_p256_uint1 x49;
-  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x2, x22);
   uint32_t x50;
   fiat_p256_uint1 x51;
-  fiat_p256_addcarryx_u32(&x50, &x51, x49, x3, x28);
   uint32_t x52;
   fiat_p256_uint1 x53;
-  fiat_p256_addcarryx_u32(&x52, &x53, 0x0, x42, (arg1[2]));
   uint32_t x54;
   fiat_p256_uint1 x55;
-  fiat_p256_addcarryx_u32(&x54, &x55, x53, x44, 0x0);
   uint32_t x56;
   fiat_p256_uint1 x57;
-  fiat_p256_addcarryx_u32(&x56, &x57, x55, x46, 0x0);
   uint32_t x58;
   uint32_t x59;
-  fiat_p256_mulx_u32(&x58, &x59, x52, UINT32_C(0xffffffff));
   uint32_t x60;
   uint32_t x61;
-  fiat_p256_mulx_u32(&x60, &x61, x52, UINT32_C(0xffffffff));
   uint32_t x62;
   uint32_t x63;
-  fiat_p256_mulx_u32(&x62, &x63, x52, UINT32_C(0xffffffff));
   uint32_t x64;
   uint32_t x65;
-  fiat_p256_mulx_u32(&x64, &x65, x52, UINT32_C(0xffffffff));
   uint32_t x66;
   fiat_p256_uint1 x67;
-  fiat_p256_addcarryx_u32(&x66, &x67, 0x0, x65, x62);
   uint32_t x68;
   fiat_p256_uint1 x69;
-  fiat_p256_addcarryx_u32(&x68, &x69, x67, x63, x60);
   uint32_t x70;
   fiat_p256_uint1 x71;
-  fiat_p256_addcarryx_u32(&x70, &x71, 0x0, x52, x64);
   uint32_t x72;
   fiat_p256_uint1 x73;
-  fiat_p256_addcarryx_u32(&x72, &x73, x71, x54, x66);
   uint32_t x74;
   fiat_p256_uint1 x75;
-  fiat_p256_addcarryx_u32(&x74, &x75, x73, x56, x68);
   uint32_t x76;
   fiat_p256_uint1 x77;
-  fiat_p256_addcarryx_u32(&x76, &x77, x75, ((uint32_t)x57 + x47), (x69 + x61));
   uint32_t x78;
   fiat_p256_uint1 x79;
-  fiat_p256_addcarryx_u32(&x78, &x79, x77, x1, 0x0);
   uint32_t x80;
   fiat_p256_uint1 x81;
-  fiat_p256_addcarryx_u32(&x80, &x81, x79, x48, 0x0);
   uint32_t x82;
   fiat_p256_uint1 x83;
-  fiat_p256_addcarryx_u32(&x82, &x83, x81, x50, x52);
   uint32_t x84;
   fiat_p256_uint1 x85;
-  fiat_p256_addcarryx_u32(&x84, &x85, x83, (x51 + x29), x58);
   uint32_t x86;
   fiat_p256_uint1 x87;
-  fiat_p256_addcarryx_u32(&x86, &x87, 0x0, x72, (arg1[3]));
   uint32_t x88;
   fiat_p256_uint1 x89;
-  fiat_p256_addcarryx_u32(&x88, &x89, x87, x74, 0x0);
   uint32_t x90;
   fiat_p256_uint1 x91;
-  fiat_p256_addcarryx_u32(&x90, &x91, x89, x76, 0x0);
   uint32_t x92;
   fiat_p256_uint1 x93;
-  fiat_p256_addcarryx_u32(&x92, &x93, x91, x78, 0x0);
   uint32_t x94;
   fiat_p256_uint1 x95;
-  fiat_p256_addcarryx_u32(&x94, &x95, x93, x80, 0x0);
   uint32_t x96;
   fiat_p256_uint1 x97;
-  fiat_p256_addcarryx_u32(&x96, &x97, x95, x82, 0x0);
   uint32_t x98;
   fiat_p256_uint1 x99;
-  fiat_p256_addcarryx_u32(&x98, &x99, x97, x84, 0x0);
   uint32_t x100;
   fiat_p256_uint1 x101;
-  fiat_p256_addcarryx_u32(&x100, &x101, x99, (x85 + x59), 0x0);
   uint32_t x102;
   uint32_t x103;
-  fiat_p256_mulx_u32(&x102, &x103, x86, UINT32_C(0xffffffff));
   uint32_t x104;
   uint32_t x105;
-  fiat_p256_mulx_u32(&x104, &x105, x86, UINT32_C(0xffffffff));
   uint32_t x106;
   uint32_t x107;
-  fiat_p256_mulx_u32(&x106, &x107, x86, UINT32_C(0xffffffff));
   uint32_t x108;
   uint32_t x109;
-  fiat_p256_mulx_u32(&x108, &x109, x86, UINT32_C(0xffffffff));
   uint32_t x110;
   fiat_p256_uint1 x111;
-  fiat_p256_addcarryx_u32(&x110, &x111, 0x0, x109, x106);
   uint32_t x112;
   fiat_p256_uint1 x113;
-  fiat_p256_addcarryx_u32(&x112, &x113, x111, x107, x104);
   uint32_t x114;
   fiat_p256_uint1 x115;
-  fiat_p256_addcarryx_u32(&x114, &x115, 0x0, x86, x108);
   uint32_t x116;
   fiat_p256_uint1 x117;
-  fiat_p256_addcarryx_u32(&x116, &x117, x115, x88, x110);
   uint32_t x118;
   fiat_p256_uint1 x119;
-  fiat_p256_addcarryx_u32(&x118, &x119, x117, x90, x112);
   uint32_t x120;
   fiat_p256_uint1 x121;
-  fiat_p256_addcarryx_u32(&x120, &x121, x119, x92, (x113 + x105));
   uint32_t x122;
   fiat_p256_uint1 x123;
-  fiat_p256_addcarryx_u32(&x122, &x123, x121, x94, 0x0);
   uint32_t x124;
   fiat_p256_uint1 x125;
-  fiat_p256_addcarryx_u32(&x124, &x125, x123, x96, 0x0);
   uint32_t x126;
   fiat_p256_uint1 x127;
-  fiat_p256_addcarryx_u32(&x126, &x127, x125, x98, x86);
   uint32_t x128;
   fiat_p256_uint1 x129;
-  fiat_p256_addcarryx_u32(&x128, &x129, x127, x100, x102);
   uint32_t x130;
   fiat_p256_uint1 x131;
-  fiat_p256_addcarryx_u32(&x130, &x131, x129, x101, x103);
   uint32_t x132;
   fiat_p256_uint1 x133;
-  fiat_p256_addcarryx_u32(&x132, &x133, 0x0, x116, (arg1[4]));
   uint32_t x134;
   fiat_p256_uint1 x135;
-  fiat_p256_addcarryx_u32(&x134, &x135, x133, x118, 0x0);
   uint32_t x136;
   fiat_p256_uint1 x137;
-  fiat_p256_addcarryx_u32(&x136, &x137, x135, x120, 0x0);
   uint32_t x138;
   fiat_p256_uint1 x139;
-  fiat_p256_addcarryx_u32(&x138, &x139, x137, x122, 0x0);
   uint32_t x140;
   fiat_p256_uint1 x141;
-  fiat_p256_addcarryx_u32(&x140, &x141, x139, x124, 0x0);
   uint32_t x142;
   fiat_p256_uint1 x143;
-  fiat_p256_addcarryx_u32(&x142, &x143, x141, x126, 0x0);
   uint32_t x144;
   fiat_p256_uint1 x145;
-  fiat_p256_addcarryx_u32(&x144, &x145, x143, x128, 0x0);
   uint32_t x146;
   fiat_p256_uint1 x147;
-  fiat_p256_addcarryx_u32(&x146, &x147, x145, x130, 0x0);
   uint32_t x148;
   uint32_t x149;
-  fiat_p256_mulx_u32(&x148, &x149, x132, UINT32_C(0xffffffff));
   uint32_t x150;
   uint32_t x151;
-  fiat_p256_mulx_u32(&x150, &x151, x132, UINT32_C(0xffffffff));
   uint32_t x152;
   uint32_t x153;
-  fiat_p256_mulx_u32(&x152, &x153, x132, UINT32_C(0xffffffff));
   uint32_t x154;
   uint32_t x155;
-  fiat_p256_mulx_u32(&x154, &x155, x132, UINT32_C(0xffffffff));
   uint32_t x156;
   fiat_p256_uint1 x157;
-  fiat_p256_addcarryx_u32(&x156, &x157, 0x0, x155, x152);
   uint32_t x158;
   fiat_p256_uint1 x159;
-  fiat_p256_addcarryx_u32(&x158, &x159, x157, x153, x150);
   uint32_t x160;
   fiat_p256_uint1 x161;
-  fiat_p256_addcarryx_u32(&x160, &x161, 0x0, x132, x154);
   uint32_t x162;
   fiat_p256_uint1 x163;
-  fiat_p256_addcarryx_u32(&x162, &x163, x161, x134, x156);
   uint32_t x164;
   fiat_p256_uint1 x165;
-  fiat_p256_addcarryx_u32(&x164, &x165, x163, x136, x158);
   uint32_t x166;
   fiat_p256_uint1 x167;
-  fiat_p256_addcarryx_u32(&x166, &x167, x165, x138, (x159 + x151));
   uint32_t x168;
   fiat_p256_uint1 x169;
-  fiat_p256_addcarryx_u32(&x168, &x169, x167, x140, 0x0);
   uint32_t x170;
   fiat_p256_uint1 x171;
-  fiat_p256_addcarryx_u32(&x170, &x171, x169, x142, 0x0);
   uint32_t x172;
   fiat_p256_uint1 x173;
-  fiat_p256_addcarryx_u32(&x172, &x173, x171, x144, x132);
   uint32_t x174;
   fiat_p256_uint1 x175;
-  fiat_p256_addcarryx_u32(&x174, &x175, x173, x146, x148);
   uint32_t x176;
   fiat_p256_uint1 x177;
-  fiat_p256_addcarryx_u32(&x176, &x177, x175, ((uint32_t)x147 + x131), x149);
   uint32_t x178;
   fiat_p256_uint1 x179;
-  fiat_p256_addcarryx_u32(&x178, &x179, 0x0, x162, (arg1[5]));
   uint32_t x180;
   fiat_p256_uint1 x181;
-  fiat_p256_addcarryx_u32(&x180, &x181, x179, x164, 0x0);
   uint32_t x182;
   fiat_p256_uint1 x183;
-  fiat_p256_addcarryx_u32(&x182, &x183, x181, x166, 0x0);
   uint32_t x184;
   fiat_p256_uint1 x185;
-  fiat_p256_addcarryx_u32(&x184, &x185, x183, x168, 0x0);
   uint32_t x186;
   fiat_p256_uint1 x187;
-  fiat_p256_addcarryx_u32(&x186, &x187, x185, x170, 0x0);
   uint32_t x188;
   fiat_p256_uint1 x189;
-  fiat_p256_addcarryx_u32(&x188, &x189, x187, x172, 0x0);
   uint32_t x190;
   fiat_p256_uint1 x191;
-  fiat_p256_addcarryx_u32(&x190, &x191, x189, x174, 0x0);
   uint32_t x192;
   fiat_p256_uint1 x193;
-  fiat_p256_addcarryx_u32(&x192, &x193, x191, x176, 0x0);
   uint32_t x194;
   uint32_t x195;
-  fiat_p256_mulx_u32(&x194, &x195, x178, UINT32_C(0xffffffff));
   uint32_t x196;
   uint32_t x197;
-  fiat_p256_mulx_u32(&x196, &x197, x178, UINT32_C(0xffffffff));
   uint32_t x198;
   uint32_t x199;
-  fiat_p256_mulx_u32(&x198, &x199, x178, UINT32_C(0xffffffff));
   uint32_t x200;
   uint32_t x201;
-  fiat_p256_mulx_u32(&x200, &x201, x178, UINT32_C(0xffffffff));
   uint32_t x202;
   fiat_p256_uint1 x203;
-  fiat_p256_addcarryx_u32(&x202, &x203, 0x0, x201, x198);
   uint32_t x204;
   fiat_p256_uint1 x205;
-  fiat_p256_addcarryx_u32(&x204, &x205, x203, x199, x196);
   uint32_t x206;
   fiat_p256_uint1 x207;
-  fiat_p256_addcarryx_u32(&x206, &x207, 0x0, x178, x200);
   uint32_t x208;
   fiat_p256_uint1 x209;
-  fiat_p256_addcarryx_u32(&x208, &x209, x207, x180, x202);
   uint32_t x210;
   fiat_p256_uint1 x211;
-  fiat_p256_addcarryx_u32(&x210, &x211, x209, x182, x204);
   uint32_t x212;
   fiat_p256_uint1 x213;
-  fiat_p256_addcarryx_u32(&x212, &x213, x211, x184, (x205 + x197));
   uint32_t x214;
   fiat_p256_uint1 x215;
-  fiat_p256_addcarryx_u32(&x214, &x215, x213, x186, 0x0);
   uint32_t x216;
   fiat_p256_uint1 x217;
-  fiat_p256_addcarryx_u32(&x216, &x217, x215, x188, 0x0);
   uint32_t x218;
   fiat_p256_uint1 x219;
-  fiat_p256_addcarryx_u32(&x218, &x219, x217, x190, x178);
   uint32_t x220;
   fiat_p256_uint1 x221;
-  fiat_p256_addcarryx_u32(&x220, &x221, x219, x192, x194);
   uint32_t x222;
   fiat_p256_uint1 x223;
-  fiat_p256_addcarryx_u32(&x222, &x223, x221, ((uint32_t)x193 + x177), x195);
   uint32_t x224;
   fiat_p256_uint1 x225;
-  fiat_p256_addcarryx_u32(&x224, &x225, 0x0, x208, (arg1[6]));
   uint32_t x226;
   fiat_p256_uint1 x227;
-  fiat_p256_addcarryx_u32(&x226, &x227, x225, x210, 0x0);
   uint32_t x228;
   fiat_p256_uint1 x229;
-  fiat_p256_addcarryx_u32(&x228, &x229, x227, x212, 0x0);
   uint32_t x230;
   fiat_p256_uint1 x231;
-  fiat_p256_addcarryx_u32(&x230, &x231, x229, x214, 0x0);
   uint32_t x232;
   fiat_p256_uint1 x233;
-  fiat_p256_addcarryx_u32(&x232, &x233, x231, x216, 0x0);
   uint32_t x234;
   fiat_p256_uint1 x235;
-  fiat_p256_addcarryx_u32(&x234, &x235, x233, x218, 0x0);
   uint32_t x236;
   fiat_p256_uint1 x237;
-  fiat_p256_addcarryx_u32(&x236, &x237, x235, x220, 0x0);
   uint32_t x238;
   fiat_p256_uint1 x239;
-  fiat_p256_addcarryx_u32(&x238, &x239, x237, x222, 0x0);
   uint32_t x240;
   uint32_t x241;
-  fiat_p256_mulx_u32(&x240, &x241, x224, UINT32_C(0xffffffff));
   uint32_t x242;
   uint32_t x243;
-  fiat_p256_mulx_u32(&x242, &x243, x224, UINT32_C(0xffffffff));
   uint32_t x244;
   uint32_t x245;
-  fiat_p256_mulx_u32(&x244, &x245, x224, UINT32_C(0xffffffff));
   uint32_t x246;
   uint32_t x247;
-  fiat_p256_mulx_u32(&x246, &x247, x224, UINT32_C(0xffffffff));
   uint32_t x248;
   fiat_p256_uint1 x249;
-  fiat_p256_addcarryx_u32(&x248, &x249, 0x0, x247, x244);
   uint32_t x250;
   fiat_p256_uint1 x251;
-  fiat_p256_addcarryx_u32(&x250, &x251, x249, x245, x242);
   uint32_t x252;
   fiat_p256_uint1 x253;
-  fiat_p256_addcarryx_u32(&x252, &x253, 0x0, x224, x246);
   uint32_t x254;
   fiat_p256_uint1 x255;
-  fiat_p256_addcarryx_u32(&x254, &x255, x253, x226, x248);
   uint32_t x256;
   fiat_p256_uint1 x257;
-  fiat_p256_addcarryx_u32(&x256, &x257, x255, x228, x250);
   uint32_t x258;
   fiat_p256_uint1 x259;
-  fiat_p256_addcarryx_u32(&x258, &x259, x257, x230, (x251 + x243));
   uint32_t x260;
   fiat_p256_uint1 x261;
-  fiat_p256_addcarryx_u32(&x260, &x261, x259, x232, 0x0);
   uint32_t x262;
   fiat_p256_uint1 x263;
-  fiat_p256_addcarryx_u32(&x262, &x263, x261, x234, 0x0);
   uint32_t x264;
   fiat_p256_uint1 x265;
-  fiat_p256_addcarryx_u32(&x264, &x265, x263, x236, x224);
   uint32_t x266;
   fiat_p256_uint1 x267;
-  fiat_p256_addcarryx_u32(&x266, &x267, x265, x238, x240);
   uint32_t x268;
   fiat_p256_uint1 x269;
-  fiat_p256_addcarryx_u32(&x268, &x269, x267, ((uint32_t)x239 + x223), x241);
   uint32_t x270;
   fiat_p256_uint1 x271;
-  fiat_p256_addcarryx_u32(&x270, &x271, 0x0, x254, (arg1[7]));
   uint32_t x272;
   fiat_p256_uint1 x273;
-  fiat_p256_addcarryx_u32(&x272, &x273, x271, x256, 0x0);
   uint32_t x274;
   fiat_p256_uint1 x275;
-  fiat_p256_addcarryx_u32(&x274, &x275, x273, x258, 0x0);
   uint32_t x276;
   fiat_p256_uint1 x277;
-  fiat_p256_addcarryx_u32(&x276, &x277, x275, x260, 0x0);
   uint32_t x278;
   fiat_p256_uint1 x279;
-  fiat_p256_addcarryx_u32(&x278, &x279, x277, x262, 0x0);
   uint32_t x280;
   fiat_p256_uint1 x281;
-  fiat_p256_addcarryx_u32(&x280, &x281, x279, x264, 0x0);
   uint32_t x282;
   fiat_p256_uint1 x283;
-  fiat_p256_addcarryx_u32(&x282, &x283, x281, x266, 0x0);
   uint32_t x284;
   fiat_p256_uint1 x285;
-  fiat_p256_addcarryx_u32(&x284, &x285, x283, x268, 0x0);
   uint32_t x286;
   uint32_t x287;
-  fiat_p256_mulx_u32(&x286, &x287, x270, UINT32_C(0xffffffff));
   uint32_t x288;
   uint32_t x289;
-  fiat_p256_mulx_u32(&x288, &x289, x270, UINT32_C(0xffffffff));
   uint32_t x290;
   uint32_t x291;
-  fiat_p256_mulx_u32(&x290, &x291, x270, UINT32_C(0xffffffff));
   uint32_t x292;
   uint32_t x293;
-  fiat_p256_mulx_u32(&x292, &x293, x270, UINT32_C(0xffffffff));
   uint32_t x294;
   fiat_p256_uint1 x295;
-  fiat_p256_addcarryx_u32(&x294, &x295, 0x0, x293, x290);
   uint32_t x296;
   fiat_p256_uint1 x297;
-  fiat_p256_addcarryx_u32(&x296, &x297, x295, x291, x288);
   uint32_t x298;
   fiat_p256_uint1 x299;
-  fiat_p256_addcarryx_u32(&x298, &x299, 0x0, x270, x292);
   uint32_t x300;
   fiat_p256_uint1 x301;
-  fiat_p256_addcarryx_u32(&x300, &x301, x299, x272, x294);
   uint32_t x302;
   fiat_p256_uint1 x303;
-  fiat_p256_addcarryx_u32(&x302, &x303, x301, x274, x296);
   uint32_t x304;
   fiat_p256_uint1 x305;
-  fiat_p256_addcarryx_u32(&x304, &x305, x303, x276, (x297 + x289));
   uint32_t x306;
   fiat_p256_uint1 x307;
-  fiat_p256_addcarryx_u32(&x306, &x307, x305, x278, 0x0);
   uint32_t x308;
   fiat_p256_uint1 x309;
-  fiat_p256_addcarryx_u32(&x308, &x309, x307, x280, 0x0);
   uint32_t x310;
   fiat_p256_uint1 x311;
-  fiat_p256_addcarryx_u32(&x310, &x311, x309, x282, x270);
   uint32_t x312;
   fiat_p256_uint1 x313;
-  fiat_p256_addcarryx_u32(&x312, &x313, x311, x284, x286);
   uint32_t x314;
   fiat_p256_uint1 x315;
-  fiat_p256_addcarryx_u32(&x314, &x315, x313, ((uint32_t)x285 + x269), x287);
   uint32_t x316;
   fiat_p256_uint1 x317;
-  fiat_p256_subborrowx_u32(&x316, &x317, 0x0, x300, UINT32_C(0xffffffff));
   uint32_t x318;
   fiat_p256_uint1 x319;
-  fiat_p256_subborrowx_u32(&x318, &x319, x317, x302, UINT32_C(0xffffffff));
   uint32_t x320;
   fiat_p256_uint1 x321;
-  fiat_p256_subborrowx_u32(&x320, &x321, x319, x304, UINT32_C(0xffffffff));
   uint32_t x322;
   fiat_p256_uint1 x323;
-  fiat_p256_subborrowx_u32(&x322, &x323, x321, x306, 0x0);
   uint32_t x324;
   fiat_p256_uint1 x325;
-  fiat_p256_subborrowx_u32(&x324, &x325, x323, x308, 0x0);
   uint32_t x326;
   fiat_p256_uint1 x327;
-  fiat_p256_subborrowx_u32(&x326, &x327, x325, x310, 0x0);
   uint32_t x328;
   fiat_p256_uint1 x329;
-  fiat_p256_subborrowx_u32(&x328, &x329, x327, x312, 0x1);
   uint32_t x330;
   fiat_p256_uint1 x331;
-  fiat_p256_subborrowx_u32(&x330, &x331, x329, x314, UINT32_C(0xffffffff));
   uint32_t x332;
   fiat_p256_uint1 x333;
-  fiat_p256_subborrowx_u32(&x332, &x333, x331, x315, 0x0);
   uint32_t x334;
-  fiat_p256_cmovznz_u32(&x334, x333, x316, x300);
   uint32_t x335;
-  fiat_p256_cmovznz_u32(&x335, x333, x318, x302);
   uint32_t x336;
-  fiat_p256_cmovznz_u32(&x336, x333, x320, x304);
   uint32_t x337;
-  fiat_p256_cmovznz_u32(&x337, x333, x322, x306);
   uint32_t x338;
-  fiat_p256_cmovznz_u32(&x338, x333, x324, x308);
   uint32_t x339;
-  fiat_p256_cmovznz_u32(&x339, x333, x326, x310);
   uint32_t x340;
-  fiat_p256_cmovznz_u32(&x340, x333, x328, x312);
   uint32_t x341;
+  x1 = (arg1[0]);
+  fiat_p256_mulx_u32(&x2, &x3, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x4, &x5, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x6, &x7, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x8, &x9, x1, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x10, &x11, 0x0, x9, x6);
+  fiat_p256_addcarryx_u32(&x12, &x13, x11, x7, x4);
+  fiat_p256_addcarryx_u32(&x14, &x15, 0x0, x1, x8);
+  fiat_p256_addcarryx_u32(&x16, &x17, x15, 0x0, x10);
+  fiat_p256_addcarryx_u32(&x18, &x19, x17, 0x0, x12);
+  fiat_p256_addcarryx_u32(&x20, &x21, x19, 0x0, (x13 + x5));
+  fiat_p256_addcarryx_u32(&x22, &x23, 0x0, x16, (arg1[1]));
+  fiat_p256_addcarryx_u32(&x24, &x25, x23, x18, 0x0);
+  fiat_p256_addcarryx_u32(&x26, &x27, x25, x20, 0x0);
+  fiat_p256_mulx_u32(&x28, &x29, x22, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x30, &x31, x22, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x32, &x33, x22, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x34, &x35, x22, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x36, &x37, 0x0, x35, x32);
+  fiat_p256_addcarryx_u32(&x38, &x39, x37, x33, x30);
+  fiat_p256_addcarryx_u32(&x40, &x41, 0x0, x22, x34);
+  fiat_p256_addcarryx_u32(&x42, &x43, x41, x24, x36);
+  fiat_p256_addcarryx_u32(&x44, &x45, x43, x26, x38);
+  fiat_p256_addcarryx_u32(&x46, &x47, x45, ((uint32_t)x27 + x21), (x39 + x31));
+  fiat_p256_addcarryx_u32(&x48, &x49, 0x0, x2, x22);
+  fiat_p256_addcarryx_u32(&x50, &x51, x49, x3, x28);
+  fiat_p256_addcarryx_u32(&x52, &x53, 0x0, x42, (arg1[2]));
+  fiat_p256_addcarryx_u32(&x54, &x55, x53, x44, 0x0);
+  fiat_p256_addcarryx_u32(&x56, &x57, x55, x46, 0x0);
+  fiat_p256_mulx_u32(&x58, &x59, x52, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x60, &x61, x52, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x62, &x63, x52, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x64, &x65, x52, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x66, &x67, 0x0, x65, x62);
+  fiat_p256_addcarryx_u32(&x68, &x69, x67, x63, x60);
+  fiat_p256_addcarryx_u32(&x70, &x71, 0x0, x52, x64);
+  fiat_p256_addcarryx_u32(&x72, &x73, x71, x54, x66);
+  fiat_p256_addcarryx_u32(&x74, &x75, x73, x56, x68);
+  fiat_p256_addcarryx_u32(&x76, &x77, x75, ((uint32_t)x57 + x47), (x69 + x61));
+  fiat_p256_addcarryx_u32(&x78, &x79, x77, x1, 0x0);
+  fiat_p256_addcarryx_u32(&x80, &x81, x79, x48, 0x0);
+  fiat_p256_addcarryx_u32(&x82, &x83, x81, x50, x52);
+  fiat_p256_addcarryx_u32(&x84, &x85, x83, (x51 + x29), x58);
+  fiat_p256_addcarryx_u32(&x86, &x87, 0x0, x72, (arg1[3]));
+  fiat_p256_addcarryx_u32(&x88, &x89, x87, x74, 0x0);
+  fiat_p256_addcarryx_u32(&x90, &x91, x89, x76, 0x0);
+  fiat_p256_addcarryx_u32(&x92, &x93, x91, x78, 0x0);
+  fiat_p256_addcarryx_u32(&x94, &x95, x93, x80, 0x0);
+  fiat_p256_addcarryx_u32(&x96, &x97, x95, x82, 0x0);
+  fiat_p256_addcarryx_u32(&x98, &x99, x97, x84, 0x0);
+  fiat_p256_addcarryx_u32(&x100, &x101, x99, (x85 + x59), 0x0);
+  fiat_p256_mulx_u32(&x102, &x103, x86, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x104, &x105, x86, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x106, &x107, x86, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x108, &x109, x86, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x110, &x111, 0x0, x109, x106);
+  fiat_p256_addcarryx_u32(&x112, &x113, x111, x107, x104);
+  fiat_p256_addcarryx_u32(&x114, &x115, 0x0, x86, x108);
+  fiat_p256_addcarryx_u32(&x116, &x117, x115, x88, x110);
+  fiat_p256_addcarryx_u32(&x118, &x119, x117, x90, x112);
+  fiat_p256_addcarryx_u32(&x120, &x121, x119, x92, (x113 + x105));
+  fiat_p256_addcarryx_u32(&x122, &x123, x121, x94, 0x0);
+  fiat_p256_addcarryx_u32(&x124, &x125, x123, x96, 0x0);
+  fiat_p256_addcarryx_u32(&x126, &x127, x125, x98, x86);
+  fiat_p256_addcarryx_u32(&x128, &x129, x127, x100, x102);
+  fiat_p256_addcarryx_u32(&x130, &x131, x129, x101, x103);
+  fiat_p256_addcarryx_u32(&x132, &x133, 0x0, x116, (arg1[4]));
+  fiat_p256_addcarryx_u32(&x134, &x135, x133, x118, 0x0);
+  fiat_p256_addcarryx_u32(&x136, &x137, x135, x120, 0x0);
+  fiat_p256_addcarryx_u32(&x138, &x139, x137, x122, 0x0);
+  fiat_p256_addcarryx_u32(&x140, &x141, x139, x124, 0x0);
+  fiat_p256_addcarryx_u32(&x142, &x143, x141, x126, 0x0);
+  fiat_p256_addcarryx_u32(&x144, &x145, x143, x128, 0x0);
+  fiat_p256_addcarryx_u32(&x146, &x147, x145, x130, 0x0);
+  fiat_p256_mulx_u32(&x148, &x149, x132, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x150, &x151, x132, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x152, &x153, x132, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x154, &x155, x132, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x156, &x157, 0x0, x155, x152);
+  fiat_p256_addcarryx_u32(&x158, &x159, x157, x153, x150);
+  fiat_p256_addcarryx_u32(&x160, &x161, 0x0, x132, x154);
+  fiat_p256_addcarryx_u32(&x162, &x163, x161, x134, x156);
+  fiat_p256_addcarryx_u32(&x164, &x165, x163, x136, x158);
+  fiat_p256_addcarryx_u32(&x166, &x167, x165, x138, (x159 + x151));
+  fiat_p256_addcarryx_u32(&x168, &x169, x167, x140, 0x0);
+  fiat_p256_addcarryx_u32(&x170, &x171, x169, x142, 0x0);
+  fiat_p256_addcarryx_u32(&x172, &x173, x171, x144, x132);
+  fiat_p256_addcarryx_u32(&x174, &x175, x173, x146, x148);
+  fiat_p256_addcarryx_u32(&x176, &x177, x175, ((uint32_t)x147 + x131), x149);
+  fiat_p256_addcarryx_u32(&x178, &x179, 0x0, x162, (arg1[5]));
+  fiat_p256_addcarryx_u32(&x180, &x181, x179, x164, 0x0);
+  fiat_p256_addcarryx_u32(&x182, &x183, x181, x166, 0x0);
+  fiat_p256_addcarryx_u32(&x184, &x185, x183, x168, 0x0);
+  fiat_p256_addcarryx_u32(&x186, &x187, x185, x170, 0x0);
+  fiat_p256_addcarryx_u32(&x188, &x189, x187, x172, 0x0);
+  fiat_p256_addcarryx_u32(&x190, &x191, x189, x174, 0x0);
+  fiat_p256_addcarryx_u32(&x192, &x193, x191, x176, 0x0);
+  fiat_p256_mulx_u32(&x194, &x195, x178, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x196, &x197, x178, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x198, &x199, x178, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x200, &x201, x178, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x202, &x203, 0x0, x201, x198);
+  fiat_p256_addcarryx_u32(&x204, &x205, x203, x199, x196);
+  fiat_p256_addcarryx_u32(&x206, &x207, 0x0, x178, x200);
+  fiat_p256_addcarryx_u32(&x208, &x209, x207, x180, x202);
+  fiat_p256_addcarryx_u32(&x210, &x211, x209, x182, x204);
+  fiat_p256_addcarryx_u32(&x212, &x213, x211, x184, (x205 + x197));
+  fiat_p256_addcarryx_u32(&x214, &x215, x213, x186, 0x0);
+  fiat_p256_addcarryx_u32(&x216, &x217, x215, x188, 0x0);
+  fiat_p256_addcarryx_u32(&x218, &x219, x217, x190, x178);
+  fiat_p256_addcarryx_u32(&x220, &x221, x219, x192, x194);
+  fiat_p256_addcarryx_u32(&x222, &x223, x221, ((uint32_t)x193 + x177), x195);
+  fiat_p256_addcarryx_u32(&x224, &x225, 0x0, x208, (arg1[6]));
+  fiat_p256_addcarryx_u32(&x226, &x227, x225, x210, 0x0);
+  fiat_p256_addcarryx_u32(&x228, &x229, x227, x212, 0x0);
+  fiat_p256_addcarryx_u32(&x230, &x231, x229, x214, 0x0);
+  fiat_p256_addcarryx_u32(&x232, &x233, x231, x216, 0x0);
+  fiat_p256_addcarryx_u32(&x234, &x235, x233, x218, 0x0);
+  fiat_p256_addcarryx_u32(&x236, &x237, x235, x220, 0x0);
+  fiat_p256_addcarryx_u32(&x238, &x239, x237, x222, 0x0);
+  fiat_p256_mulx_u32(&x240, &x241, x224, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x242, &x243, x224, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x244, &x245, x224, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x246, &x247, x224, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x248, &x249, 0x0, x247, x244);
+  fiat_p256_addcarryx_u32(&x250, &x251, x249, x245, x242);
+  fiat_p256_addcarryx_u32(&x252, &x253, 0x0, x224, x246);
+  fiat_p256_addcarryx_u32(&x254, &x255, x253, x226, x248);
+  fiat_p256_addcarryx_u32(&x256, &x257, x255, x228, x250);
+  fiat_p256_addcarryx_u32(&x258, &x259, x257, x230, (x251 + x243));
+  fiat_p256_addcarryx_u32(&x260, &x261, x259, x232, 0x0);
+  fiat_p256_addcarryx_u32(&x262, &x263, x261, x234, 0x0);
+  fiat_p256_addcarryx_u32(&x264, &x265, x263, x236, x224);
+  fiat_p256_addcarryx_u32(&x266, &x267, x265, x238, x240);
+  fiat_p256_addcarryx_u32(&x268, &x269, x267, ((uint32_t)x239 + x223), x241);
+  fiat_p256_addcarryx_u32(&x270, &x271, 0x0, x254, (arg1[7]));
+  fiat_p256_addcarryx_u32(&x272, &x273, x271, x256, 0x0);
+  fiat_p256_addcarryx_u32(&x274, &x275, x273, x258, 0x0);
+  fiat_p256_addcarryx_u32(&x276, &x277, x275, x260, 0x0);
+  fiat_p256_addcarryx_u32(&x278, &x279, x277, x262, 0x0);
+  fiat_p256_addcarryx_u32(&x280, &x281, x279, x264, 0x0);
+  fiat_p256_addcarryx_u32(&x282, &x283, x281, x266, 0x0);
+  fiat_p256_addcarryx_u32(&x284, &x285, x283, x268, 0x0);
+  fiat_p256_mulx_u32(&x286, &x287, x270, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x288, &x289, x270, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x290, &x291, x270, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x292, &x293, x270, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x294, &x295, 0x0, x293, x290);
+  fiat_p256_addcarryx_u32(&x296, &x297, x295, x291, x288);
+  fiat_p256_addcarryx_u32(&x298, &x299, 0x0, x270, x292);
+  fiat_p256_addcarryx_u32(&x300, &x301, x299, x272, x294);
+  fiat_p256_addcarryx_u32(&x302, &x303, x301, x274, x296);
+  fiat_p256_addcarryx_u32(&x304, &x305, x303, x276, (x297 + x289));
+  fiat_p256_addcarryx_u32(&x306, &x307, x305, x278, 0x0);
+  fiat_p256_addcarryx_u32(&x308, &x309, x307, x280, 0x0);
+  fiat_p256_addcarryx_u32(&x310, &x311, x309, x282, x270);
+  fiat_p256_addcarryx_u32(&x312, &x313, x311, x284, x286);
+  fiat_p256_addcarryx_u32(&x314, &x315, x313, ((uint32_t)x285 + x269), x287);
+  fiat_p256_subborrowx_u32(&x316, &x317, 0x0, x300, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x318, &x319, x317, x302, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x320, &x321, x319, x304, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x322, &x323, x321, x306, 0x0);
+  fiat_p256_subborrowx_u32(&x324, &x325, x323, x308, 0x0);
+  fiat_p256_subborrowx_u32(&x326, &x327, x325, x310, 0x0);
+  fiat_p256_subborrowx_u32(&x328, &x329, x327, x312, 0x1);
+  fiat_p256_subborrowx_u32(&x330, &x331, x329, x314, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x332, &x333, x331, x315, 0x0);
+  fiat_p256_cmovznz_u32(&x334, x333, x316, x300);
+  fiat_p256_cmovznz_u32(&x335, x333, x318, x302);
+  fiat_p256_cmovznz_u32(&x336, x333, x320, x304);
+  fiat_p256_cmovznz_u32(&x337, x333, x322, x306);
+  fiat_p256_cmovznz_u32(&x338, x333, x324, x308);
+  fiat_p256_cmovznz_u32(&x339, x333, x326, x310);
+  fiat_p256_cmovznz_u32(&x340, x333, x328, x312);
   fiat_p256_cmovznz_u32(&x341, x333, x330, x314);
   out1[0] = x334;
   out1[1] = x335;
@@ -2909,7 +2989,904 @@
 }
 
 /*
+ * The function fiat_p256_to_montgomery translates a field element into the Montgomery domain.
+ * It reads arg1[0..7] and writes out1[0..7] as 32-bit words; helper semantics (mulx/addcarryx/subborrowx/cmovznz) are defined elsewhere in this file.
+ * Preconditions:
+ *   0 ≤ eval arg1 < m
+ * Postconditions:
+ *   eval (from_montgomery out1) mod m = eval arg1 mod m
+ *   0 ≤ eval out1 < m
+ * Body: one multiply/accumulate/reduce round per input word (x8, then x1..x7), then a borrow-chain subtraction and a per-word select on the final borrow.
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_to_montgomery(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_non_montgomery_domain_field_element arg1) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint32_t x11;
+  uint32_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint32_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint32_t x19;
+  uint32_t x20;
+  uint32_t x21;
+  uint32_t x22;
+  uint32_t x23;
+  fiat_p256_uint1 x24;
+  uint32_t x25;
+  fiat_p256_uint1 x26;
+  uint32_t x27;
+  fiat_p256_uint1 x28;
+  uint32_t x29;
+  fiat_p256_uint1 x30;
+  uint32_t x31;
+  fiat_p256_uint1 x32;
+  uint32_t x33;
+  uint32_t x34;
+  uint32_t x35;
+  uint32_t x36;
+  uint32_t x37;
+  uint32_t x38;
+  uint32_t x39;
+  uint32_t x40;
+  uint32_t x41;
+  fiat_p256_uint1 x42;
+  uint32_t x43;
+  fiat_p256_uint1 x44;
+  uint32_t x45;
+  fiat_p256_uint1 x46;
+  uint32_t x47;
+  fiat_p256_uint1 x48;
+  uint32_t x49;
+  fiat_p256_uint1 x50;
+  uint32_t x51;
+  fiat_p256_uint1 x52;
+  uint32_t x53;
+  fiat_p256_uint1 x54;
+  uint32_t x55;
+  fiat_p256_uint1 x56;
+  uint32_t x57;
+  fiat_p256_uint1 x58;
+  uint32_t x59;
+  fiat_p256_uint1 x60;
+  uint32_t x61;
+  fiat_p256_uint1 x62;
+  uint32_t x63;
+  uint32_t x64;
+  uint32_t x65;
+  uint32_t x66;
+  uint32_t x67;
+  uint32_t x68;
+  uint32_t x69;
+  uint32_t x70;
+  uint32_t x71;
+  uint32_t x72;
+  uint32_t x73;
+  uint32_t x74;
+  uint32_t x75;
+  uint32_t x76;
+  uint32_t x77;
+  fiat_p256_uint1 x78;
+  uint32_t x79;
+  fiat_p256_uint1 x80;
+  uint32_t x81;
+  fiat_p256_uint1 x82;
+  uint32_t x83;
+  fiat_p256_uint1 x84;
+  uint32_t x85;
+  fiat_p256_uint1 x86;
+  uint32_t x87;
+  fiat_p256_uint1 x88;
+  uint32_t x89;
+  fiat_p256_uint1 x90;
+  uint32_t x91;
+  fiat_p256_uint1 x92;
+  uint32_t x93;
+  fiat_p256_uint1 x94;
+  uint32_t x95;
+  fiat_p256_uint1 x96;
+  uint32_t x97;
+  fiat_p256_uint1 x98;
+  uint32_t x99;
+  fiat_p256_uint1 x100;
+  uint32_t x101;
+  fiat_p256_uint1 x102;
+  uint32_t x103;
+  uint32_t x104;
+  uint32_t x105;
+  uint32_t x106;
+  uint32_t x107;
+  uint32_t x108;
+  uint32_t x109;
+  uint32_t x110;
+  uint32_t x111;
+  fiat_p256_uint1 x112;
+  uint32_t x113;
+  fiat_p256_uint1 x114;
+  uint32_t x115;
+  fiat_p256_uint1 x116;
+  uint32_t x117;
+  fiat_p256_uint1 x118;
+  uint32_t x119;
+  fiat_p256_uint1 x120;
+  uint32_t x121;
+  fiat_p256_uint1 x122;
+  uint32_t x123;
+  fiat_p256_uint1 x124;
+  uint32_t x125;
+  fiat_p256_uint1 x126;
+  uint32_t x127;
+  fiat_p256_uint1 x128;
+  uint32_t x129;
+  fiat_p256_uint1 x130;
+  uint32_t x131;
+  fiat_p256_uint1 x132;
+  uint32_t x133;
+  uint32_t x134;
+  uint32_t x135;
+  uint32_t x136;
+  uint32_t x137;
+  uint32_t x138;
+  uint32_t x139;
+  uint32_t x140;
+  uint32_t x141;
+  uint32_t x142;
+  uint32_t x143;
+  uint32_t x144;
+  uint32_t x145;
+  uint32_t x146;
+  uint32_t x147;
+  fiat_p256_uint1 x148;
+  uint32_t x149;
+  fiat_p256_uint1 x150;
+  uint32_t x151;
+  fiat_p256_uint1 x152;
+  uint32_t x153;
+  fiat_p256_uint1 x154;
+  uint32_t x155;
+  fiat_p256_uint1 x156;
+  uint32_t x157;
+  fiat_p256_uint1 x158;
+  uint32_t x159;
+  fiat_p256_uint1 x160;
+  uint32_t x161;
+  fiat_p256_uint1 x162;
+  uint32_t x163;
+  fiat_p256_uint1 x164;
+  uint32_t x165;
+  fiat_p256_uint1 x166;
+  uint32_t x167;
+  fiat_p256_uint1 x168;
+  uint32_t x169;
+  fiat_p256_uint1 x170;
+  uint32_t x171;
+  fiat_p256_uint1 x172;
+  uint32_t x173;
+  uint32_t x174;
+  uint32_t x175;
+  uint32_t x176;
+  uint32_t x177;
+  uint32_t x178;
+  uint32_t x179;
+  uint32_t x180;
+  uint32_t x181;
+  fiat_p256_uint1 x182;
+  uint32_t x183;
+  fiat_p256_uint1 x184;
+  uint32_t x185;
+  fiat_p256_uint1 x186;
+  uint32_t x187;
+  fiat_p256_uint1 x188;
+  uint32_t x189;
+  fiat_p256_uint1 x190;
+  uint32_t x191;
+  fiat_p256_uint1 x192;
+  uint32_t x193;
+  fiat_p256_uint1 x194;
+  uint32_t x195;
+  fiat_p256_uint1 x196;
+  uint32_t x197;
+  fiat_p256_uint1 x198;
+  uint32_t x199;
+  fiat_p256_uint1 x200;
+  uint32_t x201;
+  fiat_p256_uint1 x202;
+  uint32_t x203;
+  uint32_t x204;
+  uint32_t x205;
+  uint32_t x206;
+  uint32_t x207;
+  uint32_t x208;
+  uint32_t x209;
+  uint32_t x210;
+  uint32_t x211;
+  uint32_t x212;
+  uint32_t x213;
+  uint32_t x214;
+  uint32_t x215;
+  uint32_t x216;
+  uint32_t x217;
+  fiat_p256_uint1 x218;
+  uint32_t x219;
+  fiat_p256_uint1 x220;
+  uint32_t x221;
+  fiat_p256_uint1 x222;
+  uint32_t x223;
+  fiat_p256_uint1 x224;
+  uint32_t x225;
+  fiat_p256_uint1 x226;
+  uint32_t x227;
+  fiat_p256_uint1 x228;
+  uint32_t x229;
+  fiat_p256_uint1 x230;
+  uint32_t x231;
+  fiat_p256_uint1 x232;
+  uint32_t x233;
+  fiat_p256_uint1 x234;
+  uint32_t x235;
+  fiat_p256_uint1 x236;
+  uint32_t x237;
+  fiat_p256_uint1 x238;
+  uint32_t x239;
+  fiat_p256_uint1 x240;
+  uint32_t x241;
+  fiat_p256_uint1 x242;
+  uint32_t x243;
+  uint32_t x244;
+  uint32_t x245;
+  uint32_t x246;
+  uint32_t x247;
+  uint32_t x248;
+  uint32_t x249;
+  uint32_t x250;
+  uint32_t x251;
+  fiat_p256_uint1 x252;
+  uint32_t x253;
+  fiat_p256_uint1 x254;
+  uint32_t x255;
+  fiat_p256_uint1 x256;
+  uint32_t x257;
+  fiat_p256_uint1 x258;
+  uint32_t x259;
+  fiat_p256_uint1 x260;
+  uint32_t x261;
+  fiat_p256_uint1 x262;
+  uint32_t x263;
+  fiat_p256_uint1 x264;
+  uint32_t x265;
+  fiat_p256_uint1 x266;
+  uint32_t x267;
+  fiat_p256_uint1 x268;
+  uint32_t x269;
+  fiat_p256_uint1 x270;
+  uint32_t x271;
+  fiat_p256_uint1 x272;
+  uint32_t x273;
+  uint32_t x274;
+  uint32_t x275;
+  uint32_t x276;
+  uint32_t x277;
+  uint32_t x278;
+  uint32_t x279;
+  uint32_t x280;
+  uint32_t x281;
+  uint32_t x282;
+  uint32_t x283;
+  uint32_t x284;
+  uint32_t x285;
+  uint32_t x286;
+  uint32_t x287;
+  fiat_p256_uint1 x288;
+  uint32_t x289;
+  fiat_p256_uint1 x290;
+  uint32_t x291;
+  fiat_p256_uint1 x292;
+  uint32_t x293;
+  fiat_p256_uint1 x294;
+  uint32_t x295;
+  fiat_p256_uint1 x296;
+  uint32_t x297;
+  fiat_p256_uint1 x298;
+  uint32_t x299;
+  fiat_p256_uint1 x300;
+  uint32_t x301;
+  fiat_p256_uint1 x302;
+  uint32_t x303;
+  fiat_p256_uint1 x304;
+  uint32_t x305;
+  fiat_p256_uint1 x306;
+  uint32_t x307;
+  fiat_p256_uint1 x308;
+  uint32_t x309;
+  fiat_p256_uint1 x310;
+  uint32_t x311;
+  fiat_p256_uint1 x312;
+  uint32_t x313;
+  uint32_t x314;
+  uint32_t x315;
+  uint32_t x316;
+  uint32_t x317;
+  uint32_t x318;
+  uint32_t x319;
+  uint32_t x320;
+  uint32_t x321;
+  fiat_p256_uint1 x322;
+  uint32_t x323;
+  fiat_p256_uint1 x324;
+  uint32_t x325;
+  fiat_p256_uint1 x326;
+  uint32_t x327;
+  fiat_p256_uint1 x328;
+  uint32_t x329;
+  fiat_p256_uint1 x330;
+  uint32_t x331;
+  fiat_p256_uint1 x332;
+  uint32_t x333;
+  fiat_p256_uint1 x334;
+  uint32_t x335;
+  fiat_p256_uint1 x336;
+  uint32_t x337;
+  fiat_p256_uint1 x338;
+  uint32_t x339;
+  fiat_p256_uint1 x340;
+  uint32_t x341;
+  fiat_p256_uint1 x342;
+  uint32_t x343;
+  uint32_t x344;
+  uint32_t x345;
+  uint32_t x346;
+  uint32_t x347;
+  uint32_t x348;
+  uint32_t x349;
+  uint32_t x350;
+  uint32_t x351;
+  uint32_t x352;
+  uint32_t x353;
+  uint32_t x354;
+  uint32_t x355;
+  uint32_t x356;
+  uint32_t x357;
+  fiat_p256_uint1 x358;
+  uint32_t x359;
+  fiat_p256_uint1 x360;
+  uint32_t x361;
+  fiat_p256_uint1 x362;
+  uint32_t x363;
+  fiat_p256_uint1 x364;
+  uint32_t x365;
+  fiat_p256_uint1 x366;
+  uint32_t x367;
+  fiat_p256_uint1 x368;
+  uint32_t x369;
+  fiat_p256_uint1 x370;
+  uint32_t x371;
+  fiat_p256_uint1 x372;
+  uint32_t x373;
+  fiat_p256_uint1 x374;
+  uint32_t x375;
+  fiat_p256_uint1 x376;
+  uint32_t x377;
+  fiat_p256_uint1 x378;
+  uint32_t x379;
+  fiat_p256_uint1 x380;
+  uint32_t x381;
+  fiat_p256_uint1 x382;
+  uint32_t x383;
+  uint32_t x384;
+  uint32_t x385;
+  uint32_t x386;
+  uint32_t x387;
+  uint32_t x388;
+  uint32_t x389;
+  uint32_t x390;
+  uint32_t x391;
+  fiat_p256_uint1 x392;
+  uint32_t x393;
+  fiat_p256_uint1 x394;
+  uint32_t x395;
+  fiat_p256_uint1 x396;
+  uint32_t x397;
+  fiat_p256_uint1 x398;
+  uint32_t x399;
+  fiat_p256_uint1 x400;
+  uint32_t x401;
+  fiat_p256_uint1 x402;
+  uint32_t x403;
+  fiat_p256_uint1 x404;
+  uint32_t x405;
+  fiat_p256_uint1 x406;
+  uint32_t x407;
+  fiat_p256_uint1 x408;
+  uint32_t x409;
+  fiat_p256_uint1 x410;
+  uint32_t x411;
+  fiat_p256_uint1 x412;
+  uint32_t x413;
+  uint32_t x414;
+  uint32_t x415;
+  uint32_t x416;
+  uint32_t x417;
+  uint32_t x418;
+  uint32_t x419;
+  uint32_t x420;
+  uint32_t x421;
+  uint32_t x422;
+  uint32_t x423;
+  uint32_t x424;
+  uint32_t x425;
+  uint32_t x426;
+  uint32_t x427;
+  fiat_p256_uint1 x428;
+  uint32_t x429;
+  fiat_p256_uint1 x430;
+  uint32_t x431;
+  fiat_p256_uint1 x432;
+  uint32_t x433;
+  fiat_p256_uint1 x434;
+  uint32_t x435;
+  fiat_p256_uint1 x436;
+  uint32_t x437;
+  fiat_p256_uint1 x438;
+  uint32_t x439;
+  fiat_p256_uint1 x440;
+  uint32_t x441;
+  fiat_p256_uint1 x442;
+  uint32_t x443;
+  fiat_p256_uint1 x444;
+  uint32_t x445;
+  fiat_p256_uint1 x446;
+  uint32_t x447;
+  fiat_p256_uint1 x448;
+  uint32_t x449;
+  fiat_p256_uint1 x450;
+  uint32_t x451;
+  fiat_p256_uint1 x452;
+  uint32_t x453;
+  uint32_t x454;
+  uint32_t x455;
+  uint32_t x456;
+  uint32_t x457;
+  uint32_t x458;
+  uint32_t x459;
+  uint32_t x460;
+  uint32_t x461;
+  fiat_p256_uint1 x462;
+  uint32_t x463;
+  fiat_p256_uint1 x464;
+  uint32_t x465;
+  fiat_p256_uint1 x466;
+  uint32_t x467;
+  fiat_p256_uint1 x468;
+  uint32_t x469;
+  fiat_p256_uint1 x470;
+  uint32_t x471;
+  fiat_p256_uint1 x472;
+  uint32_t x473;
+  fiat_p256_uint1 x474;
+  uint32_t x475;
+  fiat_p256_uint1 x476;
+  uint32_t x477;
+  fiat_p256_uint1 x478;
+  uint32_t x479;
+  fiat_p256_uint1 x480;
+  uint32_t x481;
+  fiat_p256_uint1 x482;
+  uint32_t x483;
+  uint32_t x484;
+  uint32_t x485;
+  uint32_t x486;
+  uint32_t x487;
+  uint32_t x488;
+  uint32_t x489;
+  uint32_t x490;
+  uint32_t x491;
+  uint32_t x492;
+  uint32_t x493;
+  uint32_t x494;
+  uint32_t x495;
+  uint32_t x496;
+  uint32_t x497;
+  fiat_p256_uint1 x498;
+  uint32_t x499;
+  fiat_p256_uint1 x500;
+  uint32_t x501;
+  fiat_p256_uint1 x502;
+  uint32_t x503;
+  fiat_p256_uint1 x504;
+  uint32_t x505;
+  fiat_p256_uint1 x506;
+  uint32_t x507;
+  fiat_p256_uint1 x508;
+  uint32_t x509;
+  fiat_p256_uint1 x510;
+  uint32_t x511;
+  fiat_p256_uint1 x512;
+  uint32_t x513;
+  fiat_p256_uint1 x514;
+  uint32_t x515;
+  fiat_p256_uint1 x516;
+  uint32_t x517;
+  fiat_p256_uint1 x518;
+  uint32_t x519;
+  fiat_p256_uint1 x520;
+  uint32_t x521;
+  fiat_p256_uint1 x522;
+  uint32_t x523;
+  uint32_t x524;
+  uint32_t x525;
+  uint32_t x526;
+  uint32_t x527;
+  uint32_t x528;
+  uint32_t x529;
+  uint32_t x530;
+  uint32_t x531;
+  fiat_p256_uint1 x532;
+  uint32_t x533;
+  fiat_p256_uint1 x534;
+  uint32_t x535;
+  fiat_p256_uint1 x536;
+  uint32_t x537;
+  fiat_p256_uint1 x538;
+  uint32_t x539;
+  fiat_p256_uint1 x540;
+  uint32_t x541;
+  fiat_p256_uint1 x542;
+  uint32_t x543;
+  fiat_p256_uint1 x544;
+  uint32_t x545;
+  fiat_p256_uint1 x546;
+  uint32_t x547;
+  fiat_p256_uint1 x548;
+  uint32_t x549;
+  fiat_p256_uint1 x550;
+  uint32_t x551;
+  fiat_p256_uint1 x552;
+  uint32_t x553;
+  fiat_p256_uint1 x554;
+  uint32_t x555;
+  fiat_p256_uint1 x556;
+  uint32_t x557;
+  fiat_p256_uint1 x558;
+  uint32_t x559;
+  fiat_p256_uint1 x560;
+  uint32_t x561;
+  fiat_p256_uint1 x562;
+  uint32_t x563;
+  fiat_p256_uint1 x564;
+  uint32_t x565;
+  fiat_p256_uint1 x566;
+  uint32_t x567;
+  fiat_p256_uint1 x568;
+  uint32_t x569;
+  fiat_p256_uint1 x570;
+  uint32_t x571;
+  uint32_t x572;
+  uint32_t x573;
+  uint32_t x574;
+  uint32_t x575;
+  uint32_t x576;
+  uint32_t x577;
+  uint32_t x578;
+  x1 = (arg1[1]); /* x1..x7 hold arg1[1]..arg1[7]; arg1[0] is loaded last, into x8 */
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[4]);
+  x5 = (arg1[5]);
+  x6 = (arg1[6]);
+  x7 = (arg1[7]);
+  x8 = (arg1[0]);
+  fiat_p256_mulx_u32(&x9, &x10, x8, 0x4); /* round for arg1[0] (x8): seven constant multiplicands 0x4 .. 0x3; presumably the non-zero words of R^2 mod m -- TODO confirm */
+  fiat_p256_mulx_u32(&x11, &x12, x8, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x13, &x14, x8, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x15, &x16, x8, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x17, &x18, x8, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x19, &x20, x8, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x21, &x22, x8, 0x3);
+  fiat_p256_addcarryx_u32(&x23, &x24, 0x0, x20, x17);
+  fiat_p256_addcarryx_u32(&x25, &x26, x24, x18, x15);
+  fiat_p256_addcarryx_u32(&x27, &x28, x26, x16, x13);
+  fiat_p256_addcarryx_u32(&x29, &x30, x28, x14, x11);
+  fiat_p256_addcarryx_u32(&x31, &x32, x30, x12, x9);
+  fiat_p256_mulx_u32(&x33, &x34, x21, UINT32_C(0xffffffff)); /* reduction step keyed on the lowest accumulated word (x21); presumably a Montgomery reduction by m -- TODO confirm */
+  fiat_p256_mulx_u32(&x35, &x36, x21, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x37, &x38, x21, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x39, &x40, x21, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x41, &x42, 0x0, x40, x37);
+  fiat_p256_addcarryx_u32(&x43, &x44, x42, x38, x35);
+  fiat_p256_addcarryx_u32(&x45, &x46, 0x0, x21, x39);
+  fiat_p256_addcarryx_u32(&x47, &x48, x46, x22, x41);
+  fiat_p256_addcarryx_u32(&x49, &x50, x48, x19, x43);
+  fiat_p256_addcarryx_u32(&x51, &x52, x50, x23, (x44 + x36));
+  fiat_p256_addcarryx_u32(&x53, &x54, x52, x25, 0x0);
+  fiat_p256_addcarryx_u32(&x55, &x56, x54, x27, 0x0);
+  fiat_p256_addcarryx_u32(&x57, &x58, x56, x29, x21);
+  fiat_p256_addcarryx_u32(&x59, &x60, x58, x31, x33);
+  fiat_p256_addcarryx_u32(&x61, &x62, x60, (x32 + x10), x34);
+  fiat_p256_mulx_u32(&x63, &x64, x1, 0x4); /* round for arg1[1] (x1): same shape as the first round, accumulated onto x47..x61 */
+  fiat_p256_mulx_u32(&x65, &x66, x1, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x67, &x68, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x69, &x70, x1, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x71, &x72, x1, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x73, &x74, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x75, &x76, x1, 0x3);
+  fiat_p256_addcarryx_u32(&x77, &x78, 0x0, x74, x71);
+  fiat_p256_addcarryx_u32(&x79, &x80, x78, x72, x69);
+  fiat_p256_addcarryx_u32(&x81, &x82, x80, x70, x67);
+  fiat_p256_addcarryx_u32(&x83, &x84, x82, x68, x65);
+  fiat_p256_addcarryx_u32(&x85, &x86, x84, x66, x63);
+  fiat_p256_addcarryx_u32(&x87, &x88, 0x0, x47, x75);
+  fiat_p256_addcarryx_u32(&x89, &x90, x88, x49, x76);
+  fiat_p256_addcarryx_u32(&x91, &x92, x90, x51, x73);
+  fiat_p256_addcarryx_u32(&x93, &x94, x92, x53, x77);
+  fiat_p256_addcarryx_u32(&x95, &x96, x94, x55, x79);
+  fiat_p256_addcarryx_u32(&x97, &x98, x96, x57, x81);
+  fiat_p256_addcarryx_u32(&x99, &x100, x98, x59, x83);
+  fiat_p256_addcarryx_u32(&x101, &x102, x100, x61, x85);
+  fiat_p256_mulx_u32(&x103, &x104, x87, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x105, &x106, x87, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x107, &x108, x87, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x109, &x110, x87, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x111, &x112, 0x0, x110, x107);
+  fiat_p256_addcarryx_u32(&x113, &x114, x112, x108, x105);
+  fiat_p256_addcarryx_u32(&x115, &x116, 0x0, x87, x109);
+  fiat_p256_addcarryx_u32(&x117, &x118, x116, x89, x111);
+  fiat_p256_addcarryx_u32(&x119, &x120, x118, x91, x113);
+  fiat_p256_addcarryx_u32(&x121, &x122, x120, x93, (x114 + x106));
+  fiat_p256_addcarryx_u32(&x123, &x124, x122, x95, 0x0);
+  fiat_p256_addcarryx_u32(&x125, &x126, x124, x97, 0x0);
+  fiat_p256_addcarryx_u32(&x127, &x128, x126, x99, x87);
+  fiat_p256_addcarryx_u32(&x129, &x130, x128, x101, x103);
+  fiat_p256_addcarryx_u32(&x131, &x132, x130, (((uint32_t)x102 + x62) + (x86 + x64)), x104);
+  fiat_p256_mulx_u32(&x133, &x134, x2, 0x4); /* round for arg1[2] (x2) */
+  fiat_p256_mulx_u32(&x135, &x136, x2, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x137, &x138, x2, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x139, &x140, x2, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x141, &x142, x2, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x143, &x144, x2, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x145, &x146, x2, 0x3);
+  fiat_p256_addcarryx_u32(&x147, &x148, 0x0, x144, x141);
+  fiat_p256_addcarryx_u32(&x149, &x150, x148, x142, x139);
+  fiat_p256_addcarryx_u32(&x151, &x152, x150, x140, x137);
+  fiat_p256_addcarryx_u32(&x153, &x154, x152, x138, x135);
+  fiat_p256_addcarryx_u32(&x155, &x156, x154, x136, x133);
+  fiat_p256_addcarryx_u32(&x157, &x158, 0x0, x117, x145);
+  fiat_p256_addcarryx_u32(&x159, &x160, x158, x119, x146);
+  fiat_p256_addcarryx_u32(&x161, &x162, x160, x121, x143);
+  fiat_p256_addcarryx_u32(&x163, &x164, x162, x123, x147);
+  fiat_p256_addcarryx_u32(&x165, &x166, x164, x125, x149);
+  fiat_p256_addcarryx_u32(&x167, &x168, x166, x127, x151);
+  fiat_p256_addcarryx_u32(&x169, &x170, x168, x129, x153);
+  fiat_p256_addcarryx_u32(&x171, &x172, x170, x131, x155);
+  fiat_p256_mulx_u32(&x173, &x174, x157, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x175, &x176, x157, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x177, &x178, x157, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x179, &x180, x157, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x181, &x182, 0x0, x180, x177);
+  fiat_p256_addcarryx_u32(&x183, &x184, x182, x178, x175);
+  fiat_p256_addcarryx_u32(&x185, &x186, 0x0, x157, x179);
+  fiat_p256_addcarryx_u32(&x187, &x188, x186, x159, x181);
+  fiat_p256_addcarryx_u32(&x189, &x190, x188, x161, x183);
+  fiat_p256_addcarryx_u32(&x191, &x192, x190, x163, (x184 + x176));
+  fiat_p256_addcarryx_u32(&x193, &x194, x192, x165, 0x0);
+  fiat_p256_addcarryx_u32(&x195, &x196, x194, x167, 0x0);
+  fiat_p256_addcarryx_u32(&x197, &x198, x196, x169, x157);
+  fiat_p256_addcarryx_u32(&x199, &x200, x198, x171, x173);
+  fiat_p256_addcarryx_u32(&x201, &x202, x200, (((uint32_t)x172 + x132) + (x156 + x134)), x174);
+  fiat_p256_mulx_u32(&x203, &x204, x3, 0x4); /* round for arg1[3] (x3) */
+  fiat_p256_mulx_u32(&x205, &x206, x3, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x207, &x208, x3, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x209, &x210, x3, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x211, &x212, x3, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x213, &x214, x3, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x215, &x216, x3, 0x3);
+  fiat_p256_addcarryx_u32(&x217, &x218, 0x0, x214, x211);
+  fiat_p256_addcarryx_u32(&x219, &x220, x218, x212, x209);
+  fiat_p256_addcarryx_u32(&x221, &x222, x220, x210, x207);
+  fiat_p256_addcarryx_u32(&x223, &x224, x222, x208, x205);
+  fiat_p256_addcarryx_u32(&x225, &x226, x224, x206, x203);
+  fiat_p256_addcarryx_u32(&x227, &x228, 0x0, x187, x215);
+  fiat_p256_addcarryx_u32(&x229, &x230, x228, x189, x216);
+  fiat_p256_addcarryx_u32(&x231, &x232, x230, x191, x213);
+  fiat_p256_addcarryx_u32(&x233, &x234, x232, x193, x217);
+  fiat_p256_addcarryx_u32(&x235, &x236, x234, x195, x219);
+  fiat_p256_addcarryx_u32(&x237, &x238, x236, x197, x221);
+  fiat_p256_addcarryx_u32(&x239, &x240, x238, x199, x223);
+  fiat_p256_addcarryx_u32(&x241, &x242, x240, x201, x225);
+  fiat_p256_mulx_u32(&x243, &x244, x227, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x245, &x246, x227, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x247, &x248, x227, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x249, &x250, x227, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x251, &x252, 0x0, x250, x247);
+  fiat_p256_addcarryx_u32(&x253, &x254, x252, x248, x245);
+  fiat_p256_addcarryx_u32(&x255, &x256, 0x0, x227, x249);
+  fiat_p256_addcarryx_u32(&x257, &x258, x256, x229, x251);
+  fiat_p256_addcarryx_u32(&x259, &x260, x258, x231, x253);
+  fiat_p256_addcarryx_u32(&x261, &x262, x260, x233, (x254 + x246));
+  fiat_p256_addcarryx_u32(&x263, &x264, x262, x235, 0x0);
+  fiat_p256_addcarryx_u32(&x265, &x266, x264, x237, 0x0);
+  fiat_p256_addcarryx_u32(&x267, &x268, x266, x239, x227);
+  fiat_p256_addcarryx_u32(&x269, &x270, x268, x241, x243);
+  fiat_p256_addcarryx_u32(&x271, &x272, x270, (((uint32_t)x242 + x202) + (x226 + x204)), x244);
+  fiat_p256_mulx_u32(&x273, &x274, x4, 0x4); /* round for arg1[4] (x4) */
+  fiat_p256_mulx_u32(&x275, &x276, x4, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x277, &x278, x4, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x279, &x280, x4, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x281, &x282, x4, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x283, &x284, x4, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x285, &x286, x4, 0x3);
+  fiat_p256_addcarryx_u32(&x287, &x288, 0x0, x284, x281);
+  fiat_p256_addcarryx_u32(&x289, &x290, x288, x282, x279);
+  fiat_p256_addcarryx_u32(&x291, &x292, x290, x280, x277);
+  fiat_p256_addcarryx_u32(&x293, &x294, x292, x278, x275);
+  fiat_p256_addcarryx_u32(&x295, &x296, x294, x276, x273);
+  fiat_p256_addcarryx_u32(&x297, &x298, 0x0, x257, x285);
+  fiat_p256_addcarryx_u32(&x299, &x300, x298, x259, x286);
+  fiat_p256_addcarryx_u32(&x301, &x302, x300, x261, x283);
+  fiat_p256_addcarryx_u32(&x303, &x304, x302, x263, x287);
+  fiat_p256_addcarryx_u32(&x305, &x306, x304, x265, x289);
+  fiat_p256_addcarryx_u32(&x307, &x308, x306, x267, x291);
+  fiat_p256_addcarryx_u32(&x309, &x310, x308, x269, x293);
+  fiat_p256_addcarryx_u32(&x311, &x312, x310, x271, x295);
+  fiat_p256_mulx_u32(&x313, &x314, x297, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x315, &x316, x297, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x317, &x318, x297, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x319, &x320, x297, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x321, &x322, 0x0, x320, x317);
+  fiat_p256_addcarryx_u32(&x323, &x324, x322, x318, x315);
+  fiat_p256_addcarryx_u32(&x325, &x326, 0x0, x297, x319);
+  fiat_p256_addcarryx_u32(&x327, &x328, x326, x299, x321);
+  fiat_p256_addcarryx_u32(&x329, &x330, x328, x301, x323);
+  fiat_p256_addcarryx_u32(&x331, &x332, x330, x303, (x324 + x316));
+  fiat_p256_addcarryx_u32(&x333, &x334, x332, x305, 0x0);
+  fiat_p256_addcarryx_u32(&x335, &x336, x334, x307, 0x0);
+  fiat_p256_addcarryx_u32(&x337, &x338, x336, x309, x297);
+  fiat_p256_addcarryx_u32(&x339, &x340, x338, x311, x313);
+  fiat_p256_addcarryx_u32(&x341, &x342, x340, (((uint32_t)x312 + x272) + (x296 + x274)), x314);
+  fiat_p256_mulx_u32(&x343, &x344, x5, 0x4); /* round for arg1[5] (x5) */
+  fiat_p256_mulx_u32(&x345, &x346, x5, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x347, &x348, x5, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x349, &x350, x5, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x351, &x352, x5, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x353, &x354, x5, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x355, &x356, x5, 0x3);
+  fiat_p256_addcarryx_u32(&x357, &x358, 0x0, x354, x351);
+  fiat_p256_addcarryx_u32(&x359, &x360, x358, x352, x349);
+  fiat_p256_addcarryx_u32(&x361, &x362, x360, x350, x347);
+  fiat_p256_addcarryx_u32(&x363, &x364, x362, x348, x345);
+  fiat_p256_addcarryx_u32(&x365, &x366, x364, x346, x343);
+  fiat_p256_addcarryx_u32(&x367, &x368, 0x0, x327, x355);
+  fiat_p256_addcarryx_u32(&x369, &x370, x368, x329, x356);
+  fiat_p256_addcarryx_u32(&x371, &x372, x370, x331, x353);
+  fiat_p256_addcarryx_u32(&x373, &x374, x372, x333, x357);
+  fiat_p256_addcarryx_u32(&x375, &x376, x374, x335, x359);
+  fiat_p256_addcarryx_u32(&x377, &x378, x376, x337, x361);
+  fiat_p256_addcarryx_u32(&x379, &x380, x378, x339, x363);
+  fiat_p256_addcarryx_u32(&x381, &x382, x380, x341, x365);
+  fiat_p256_mulx_u32(&x383, &x384, x367, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x385, &x386, x367, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x387, &x388, x367, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x389, &x390, x367, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x391, &x392, 0x0, x390, x387);
+  fiat_p256_addcarryx_u32(&x393, &x394, x392, x388, x385);
+  fiat_p256_addcarryx_u32(&x395, &x396, 0x0, x367, x389);
+  fiat_p256_addcarryx_u32(&x397, &x398, x396, x369, x391);
+  fiat_p256_addcarryx_u32(&x399, &x400, x398, x371, x393);
+  fiat_p256_addcarryx_u32(&x401, &x402, x400, x373, (x394 + x386));
+  fiat_p256_addcarryx_u32(&x403, &x404, x402, x375, 0x0);
+  fiat_p256_addcarryx_u32(&x405, &x406, x404, x377, 0x0);
+  fiat_p256_addcarryx_u32(&x407, &x408, x406, x379, x367);
+  fiat_p256_addcarryx_u32(&x409, &x410, x408, x381, x383);
+  fiat_p256_addcarryx_u32(&x411, &x412, x410, (((uint32_t)x382 + x342) + (x366 + x344)), x384);
+  fiat_p256_mulx_u32(&x413, &x414, x6, 0x4); /* round for arg1[6] (x6) */
+  fiat_p256_mulx_u32(&x415, &x416, x6, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x417, &x418, x6, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x419, &x420, x6, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x421, &x422, x6, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x423, &x424, x6, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x425, &x426, x6, 0x3);
+  fiat_p256_addcarryx_u32(&x427, &x428, 0x0, x424, x421);
+  fiat_p256_addcarryx_u32(&x429, &x430, x428, x422, x419);
+  fiat_p256_addcarryx_u32(&x431, &x432, x430, x420, x417);
+  fiat_p256_addcarryx_u32(&x433, &x434, x432, x418, x415);
+  fiat_p256_addcarryx_u32(&x435, &x436, x434, x416, x413);
+  fiat_p256_addcarryx_u32(&x437, &x438, 0x0, x397, x425);
+  fiat_p256_addcarryx_u32(&x439, &x440, x438, x399, x426);
+  fiat_p256_addcarryx_u32(&x441, &x442, x440, x401, x423);
+  fiat_p256_addcarryx_u32(&x443, &x444, x442, x403, x427);
+  fiat_p256_addcarryx_u32(&x445, &x446, x444, x405, x429);
+  fiat_p256_addcarryx_u32(&x447, &x448, x446, x407, x431);
+  fiat_p256_addcarryx_u32(&x449, &x450, x448, x409, x433);
+  fiat_p256_addcarryx_u32(&x451, &x452, x450, x411, x435);
+  fiat_p256_mulx_u32(&x453, &x454, x437, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x455, &x456, x437, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x457, &x458, x437, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x459, &x460, x437, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x461, &x462, 0x0, x460, x457);
+  fiat_p256_addcarryx_u32(&x463, &x464, x462, x458, x455);
+  fiat_p256_addcarryx_u32(&x465, &x466, 0x0, x437, x459);
+  fiat_p256_addcarryx_u32(&x467, &x468, x466, x439, x461);
+  fiat_p256_addcarryx_u32(&x469, &x470, x468, x441, x463);
+  fiat_p256_addcarryx_u32(&x471, &x472, x470, x443, (x464 + x456));
+  fiat_p256_addcarryx_u32(&x473, &x474, x472, x445, 0x0);
+  fiat_p256_addcarryx_u32(&x475, &x476, x474, x447, 0x0);
+  fiat_p256_addcarryx_u32(&x477, &x478, x476, x449, x437);
+  fiat_p256_addcarryx_u32(&x479, &x480, x478, x451, x453);
+  fiat_p256_addcarryx_u32(&x481, &x482, x480, (((uint32_t)x452 + x412) + (x436 + x414)), x454);
+  fiat_p256_mulx_u32(&x483, &x484, x7, 0x4); /* round for arg1[7] (x7), the last input word */
+  fiat_p256_mulx_u32(&x485, &x486, x7, UINT32_C(0xfffffffd));
+  fiat_p256_mulx_u32(&x487, &x488, x7, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x489, &x490, x7, UINT32_C(0xfffffffe));
+  fiat_p256_mulx_u32(&x491, &x492, x7, UINT32_C(0xfffffffb));
+  fiat_p256_mulx_u32(&x493, &x494, x7, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x495, &x496, x7, 0x3);
+  fiat_p256_addcarryx_u32(&x497, &x498, 0x0, x494, x491);
+  fiat_p256_addcarryx_u32(&x499, &x500, x498, x492, x489);
+  fiat_p256_addcarryx_u32(&x501, &x502, x500, x490, x487);
+  fiat_p256_addcarryx_u32(&x503, &x504, x502, x488, x485);
+  fiat_p256_addcarryx_u32(&x505, &x506, x504, x486, x483);
+  fiat_p256_addcarryx_u32(&x507, &x508, 0x0, x467, x495);
+  fiat_p256_addcarryx_u32(&x509, &x510, x508, x469, x496);
+  fiat_p256_addcarryx_u32(&x511, &x512, x510, x471, x493);
+  fiat_p256_addcarryx_u32(&x513, &x514, x512, x473, x497);
+  fiat_p256_addcarryx_u32(&x515, &x516, x514, x475, x499);
+  fiat_p256_addcarryx_u32(&x517, &x518, x516, x477, x501);
+  fiat_p256_addcarryx_u32(&x519, &x520, x518, x479, x503);
+  fiat_p256_addcarryx_u32(&x521, &x522, x520, x481, x505);
+  fiat_p256_mulx_u32(&x523, &x524, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x525, &x526, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x527, &x528, x507, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u32(&x529, &x530, x507, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x531, &x532, 0x0, x530, x527);
+  fiat_p256_addcarryx_u32(&x533, &x534, x532, x528, x525);
+  fiat_p256_addcarryx_u32(&x535, &x536, 0x0, x507, x529);
+  fiat_p256_addcarryx_u32(&x537, &x538, x536, x509, x531);
+  fiat_p256_addcarryx_u32(&x539, &x540, x538, x511, x533);
+  fiat_p256_addcarryx_u32(&x541, &x542, x540, x513, (x534 + x526));
+  fiat_p256_addcarryx_u32(&x543, &x544, x542, x515, 0x0);
+  fiat_p256_addcarryx_u32(&x545, &x546, x544, x517, 0x0);
+  fiat_p256_addcarryx_u32(&x547, &x548, x546, x519, x507);
+  fiat_p256_addcarryx_u32(&x549, &x550, x548, x521, x523);
+  fiat_p256_addcarryx_u32(&x551, &x552, x550, (((uint32_t)x522 + x482) + (x506 + x484)), x524);
+  fiat_p256_subborrowx_u32(&x553, &x554, 0x0, x537, UINT32_C(0xffffffff)); /* borrow-chain subtraction of the constant words ffffffff,ffffffff,ffffffff,0,0,0,1,ffffffff (presumably m -- TODO confirm) from x537..x551 */
+  fiat_p256_subborrowx_u32(&x555, &x556, x554, x539, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x557, &x558, x556, x541, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x559, &x560, x558, x543, 0x0);
+  fiat_p256_subborrowx_u32(&x561, &x562, x560, x545, 0x0);
+  fiat_p256_subborrowx_u32(&x563, &x564, x562, x547, 0x0);
+  fiat_p256_subborrowx_u32(&x565, &x566, x564, x549, 0x1);
+  fiat_p256_subborrowx_u32(&x567, &x568, x566, x551, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x569, &x570, x568, x552, 0x0);
+  fiat_p256_cmovznz_u32(&x571, x570, x553, x537); /* per-word select keyed on the final borrow x570: subtracted word (x553..x567) vs. unsubtracted word (x537..x551) */
+  fiat_p256_cmovznz_u32(&x572, x570, x555, x539);
+  fiat_p256_cmovznz_u32(&x573, x570, x557, x541);
+  fiat_p256_cmovznz_u32(&x574, x570, x559, x543);
+  fiat_p256_cmovznz_u32(&x575, x570, x561, x545);
+  fiat_p256_cmovznz_u32(&x576, x570, x563, x547);
+  fiat_p256_cmovznz_u32(&x577, x570, x565, x549);
+  fiat_p256_cmovznz_u32(&x578, x570, x567, x551);
+  out1[0] = x571;
+  out1[1] = x572;
+  out1[2] = x573;
+  out1[3] = x574;
+  out1[4] = x575;
+  out1[5] = x576;
+  out1[6] = x577;
+  out1[7] = x578;
+}
+
+/*
  * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
@@ -2920,13 +3897,15 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffff]
  */
-static void fiat_p256_nonzero(uint32_t* out1, const uint32_t arg1[8]) {
-  uint32_t x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | ((arg1[4]) | ((arg1[5]) | ((arg1[6]) | ((arg1[7]) | (uint32_t)0x0))))))));
+static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint32_t* out1, const uint32_t arg1[8]) {
+  uint32_t x1; /* receives the bitwise OR of arg1[0..7]; it is zero only when every word of arg1 is zero */
+  x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | ((arg1[4]) | ((arg1[5]) | ((arg1[6]) | (arg1[7]))))))));
   *out1 = x1;
 }
 
 /*
  * The function fiat_p256_selectznz is a multi-limb conditional select.
+ *
  * Postconditions:
  *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
  *
@@ -2937,22 +3916,22 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_selectznz(uint32_t out1[8], fiat_p256_uint1 arg1, const uint32_t arg2[8], const uint32_t arg3[8]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint32_t out1[8], fiat_p256_uint1 arg1, const uint32_t arg2[8], const uint32_t arg3[8]) {
   uint32_t x1;
-  fiat_p256_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0]));
   uint32_t x2;
-  fiat_p256_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1]));
   uint32_t x3;
-  fiat_p256_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2]));
   uint32_t x4;
-  fiat_p256_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3]));
   uint32_t x5;
-  fiat_p256_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4]));
   uint32_t x6;
-  fiat_p256_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5]));
   uint32_t x7;
-  fiat_p256_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6]));
   uint32_t x8;
+  fiat_p256_cmovznz_u32(&x1, arg1, (arg2[0]), (arg3[0])); /* one select per word, all keyed on arg1; the calls now follow the declarations instead of being interleaved with them */
+  fiat_p256_cmovznz_u32(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u32(&x3, arg1, (arg2[2]), (arg3[2]));
+  fiat_p256_cmovznz_u32(&x4, arg1, (arg2[3]), (arg3[3]));
+  fiat_p256_cmovznz_u32(&x5, arg1, (arg2[4]), (arg3[4]));
+  fiat_p256_cmovznz_u32(&x6, arg1, (arg2[5]), (arg3[5]));
+  fiat_p256_cmovznz_u32(&x7, arg1, (arg2[6]), (arg3[6]));
   fiat_p256_cmovznz_u32(&x8, arg1, (arg2[7]), (arg3[7]));
   out1[0] = x1;
   out1[1] = x2;
@@ -2965,7 +3944,8 @@
 }
 
 /*
- * The function fiat_p256_to_bytes serializes a field element in the Montgomery domain to bytes in little-endian order.
+ * The function fiat_p256_to_bytes serializes a field element NOT in the Montgomery domain to bytes in little-endian order.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
@@ -2976,106 +3956,156 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]]
  */
-static void fiat_p256_to_bytes(uint8_t out1[32], const uint32_t arg1[8]) {
-  uint32_t x1 = (arg1[7]);
-  uint32_t x2 = (arg1[6]);
-  uint32_t x3 = (arg1[5]);
-  uint32_t x4 = (arg1[4]);
-  uint32_t x5 = (arg1[3]);
-  uint32_t x6 = (arg1[2]);
-  uint32_t x7 = (arg1[1]);
-  uint32_t x8 = (arg1[0]);
-  uint32_t x9 = (x8 >> 8);
-  uint8_t x10 = (uint8_t)(x8 & UINT8_C(0xff));
-  uint32_t x11 = (x9 >> 8);
-  uint8_t x12 = (uint8_t)(x9 & UINT8_C(0xff));
-  uint8_t x13 = (uint8_t)(x11 >> 8);
-  uint8_t x14 = (uint8_t)(x11 & UINT8_C(0xff));
-  uint8_t x15 = (uint8_t)(x13 & UINT8_C(0xff));
-  uint32_t x16 = (x7 >> 8);
-  uint8_t x17 = (uint8_t)(x7 & UINT8_C(0xff));
-  uint32_t x18 = (x16 >> 8);
-  uint8_t x19 = (uint8_t)(x16 & UINT8_C(0xff));
-  uint8_t x20 = (uint8_t)(x18 >> 8);
-  uint8_t x21 = (uint8_t)(x18 & UINT8_C(0xff));
-  uint8_t x22 = (uint8_t)(x20 & UINT8_C(0xff));
-  uint32_t x23 = (x6 >> 8);
-  uint8_t x24 = (uint8_t)(x6 & UINT8_C(0xff));
-  uint32_t x25 = (x23 >> 8);
-  uint8_t x26 = (uint8_t)(x23 & UINT8_C(0xff));
-  uint8_t x27 = (uint8_t)(x25 >> 8);
-  uint8_t x28 = (uint8_t)(x25 & UINT8_C(0xff));
-  uint8_t x29 = (uint8_t)(x27 & UINT8_C(0xff));
-  uint32_t x30 = (x5 >> 8);
-  uint8_t x31 = (uint8_t)(x5 & UINT8_C(0xff));
-  uint32_t x32 = (x30 >> 8);
-  uint8_t x33 = (uint8_t)(x30 & UINT8_C(0xff));
-  uint8_t x34 = (uint8_t)(x32 >> 8);
-  uint8_t x35 = (uint8_t)(x32 & UINT8_C(0xff));
-  uint8_t x36 = (uint8_t)(x34 & UINT8_C(0xff));
-  uint32_t x37 = (x4 >> 8);
-  uint8_t x38 = (uint8_t)(x4 & UINT8_C(0xff));
-  uint32_t x39 = (x37 >> 8);
-  uint8_t x40 = (uint8_t)(x37 & UINT8_C(0xff));
-  uint8_t x41 = (uint8_t)(x39 >> 8);
-  uint8_t x42 = (uint8_t)(x39 & UINT8_C(0xff));
-  uint8_t x43 = (uint8_t)(x41 & UINT8_C(0xff));
-  uint32_t x44 = (x3 >> 8);
-  uint8_t x45 = (uint8_t)(x3 & UINT8_C(0xff));
-  uint32_t x46 = (x44 >> 8);
-  uint8_t x47 = (uint8_t)(x44 & UINT8_C(0xff));
-  uint8_t x48 = (uint8_t)(x46 >> 8);
-  uint8_t x49 = (uint8_t)(x46 & UINT8_C(0xff));
-  uint8_t x50 = (uint8_t)(x48 & UINT8_C(0xff));
-  uint32_t x51 = (x2 >> 8);
-  uint8_t x52 = (uint8_t)(x2 & UINT8_C(0xff));
-  uint32_t x53 = (x51 >> 8);
-  uint8_t x54 = (uint8_t)(x51 & UINT8_C(0xff));
-  uint8_t x55 = (uint8_t)(x53 >> 8);
-  uint8_t x56 = (uint8_t)(x53 & UINT8_C(0xff));
-  uint8_t x57 = (uint8_t)(x55 & UINT8_C(0xff));
-  uint32_t x58 = (x1 >> 8);
-  uint8_t x59 = (uint8_t)(x1 & UINT8_C(0xff));
-  uint32_t x60 = (x58 >> 8);
-  uint8_t x61 = (uint8_t)(x58 & UINT8_C(0xff));
-  uint8_t x62 = (uint8_t)(x60 >> 8);
-  uint8_t x63 = (uint8_t)(x60 & UINT8_C(0xff));
-  out1[0] = x10;
-  out1[1] = x12;
-  out1[2] = x14;
-  out1[3] = x15;
-  out1[4] = x17;
-  out1[5] = x19;
-  out1[6] = x21;
-  out1[7] = x22;
-  out1[8] = x24;
-  out1[9] = x26;
-  out1[10] = x28;
-  out1[11] = x29;
-  out1[12] = x31;
-  out1[13] = x33;
-  out1[14] = x35;
-  out1[15] = x36;
-  out1[16] = x38;
-  out1[17] = x40;
-  out1[18] = x42;
-  out1[19] = x43;
-  out1[20] = x45;
-  out1[21] = x47;
-  out1[22] = x49;
-  out1[23] = x50;
-  out1[24] = x52;
-  out1[25] = x54;
-  out1[26] = x56;
-  out1[27] = x57;
-  out1[28] = x59;
-  out1[29] = x61;
-  out1[30] = x63;
-  out1[31] = x62;
+static FIAT_P256_FIAT_INLINE void fiat_p256_to_bytes(uint8_t out1[32], const uint32_t arg1[8]) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint32_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint8_t x9;
+  uint32_t x10;
+  uint8_t x11;
+  uint32_t x12;
+  uint8_t x13;
+  uint8_t x14;
+  uint8_t x15;
+  uint32_t x16;
+  uint8_t x17;
+  uint32_t x18;
+  uint8_t x19;
+  uint8_t x20;
+  uint8_t x21;
+  uint32_t x22;
+  uint8_t x23;
+  uint32_t x24;
+  uint8_t x25;
+  uint8_t x26;
+  uint8_t x27;
+  uint32_t x28;
+  uint8_t x29;
+  uint32_t x30;
+  uint8_t x31;
+  uint8_t x32;
+  uint8_t x33;
+  uint32_t x34;
+  uint8_t x35;
+  uint32_t x36;
+  uint8_t x37;
+  uint8_t x38;
+  uint8_t x39;
+  uint32_t x40;
+  uint8_t x41;
+  uint32_t x42;
+  uint8_t x43;
+  uint8_t x44;
+  uint8_t x45;
+  uint32_t x46;
+  uint8_t x47;
+  uint32_t x48;
+  uint8_t x49;
+  uint8_t x50;
+  uint8_t x51;
+  uint32_t x52;
+  uint8_t x53;
+  uint32_t x54;
+  uint8_t x55;
+  uint8_t x56;
+  x1 = (arg1[7]);
+  x2 = (arg1[6]);
+  x3 = (arg1[5]);
+  x4 = (arg1[4]);
+  x5 = (arg1[3]);
+  x6 = (arg1[2]);
+  x7 = (arg1[1]);
+  x8 = (arg1[0]);
+  x9 = (uint8_t)(x8 & UINT8_C(0xff));
+  x10 = (x8 >> 8);
+  x11 = (uint8_t)(x10 & UINT8_C(0xff));
+  x12 = (x10 >> 8);
+  x13 = (uint8_t)(x12 & UINT8_C(0xff));
+  x14 = (uint8_t)(x12 >> 8);
+  x15 = (uint8_t)(x7 & UINT8_C(0xff));
+  x16 = (x7 >> 8);
+  x17 = (uint8_t)(x16 & UINT8_C(0xff));
+  x18 = (x16 >> 8);
+  x19 = (uint8_t)(x18 & UINT8_C(0xff));
+  x20 = (uint8_t)(x18 >> 8);
+  x21 = (uint8_t)(x6 & UINT8_C(0xff));
+  x22 = (x6 >> 8);
+  x23 = (uint8_t)(x22 & UINT8_C(0xff));
+  x24 = (x22 >> 8);
+  x25 = (uint8_t)(x24 & UINT8_C(0xff));
+  x26 = (uint8_t)(x24 >> 8);
+  x27 = (uint8_t)(x5 & UINT8_C(0xff));
+  x28 = (x5 >> 8);
+  x29 = (uint8_t)(x28 & UINT8_C(0xff));
+  x30 = (x28 >> 8);
+  x31 = (uint8_t)(x30 & UINT8_C(0xff));
+  x32 = (uint8_t)(x30 >> 8);
+  x33 = (uint8_t)(x4 & UINT8_C(0xff));
+  x34 = (x4 >> 8);
+  x35 = (uint8_t)(x34 & UINT8_C(0xff));
+  x36 = (x34 >> 8);
+  x37 = (uint8_t)(x36 & UINT8_C(0xff));
+  x38 = (uint8_t)(x36 >> 8);
+  x39 = (uint8_t)(x3 & UINT8_C(0xff));
+  x40 = (x3 >> 8);
+  x41 = (uint8_t)(x40 & UINT8_C(0xff));
+  x42 = (x40 >> 8);
+  x43 = (uint8_t)(x42 & UINT8_C(0xff));
+  x44 = (uint8_t)(x42 >> 8);
+  x45 = (uint8_t)(x2 & UINT8_C(0xff));
+  x46 = (x2 >> 8);
+  x47 = (uint8_t)(x46 & UINT8_C(0xff));
+  x48 = (x46 >> 8);
+  x49 = (uint8_t)(x48 & UINT8_C(0xff));
+  x50 = (uint8_t)(x48 >> 8);
+  x51 = (uint8_t)(x1 & UINT8_C(0xff));
+  x52 = (x1 >> 8);
+  x53 = (uint8_t)(x52 & UINT8_C(0xff));
+  x54 = (x52 >> 8);
+  x55 = (uint8_t)(x54 & UINT8_C(0xff));
+  x56 = (uint8_t)(x54 >> 8);
+  out1[0] = x9;
+  out1[1] = x11;
+  out1[2] = x13;
+  out1[3] = x14;
+  out1[4] = x15;
+  out1[5] = x17;
+  out1[6] = x19;
+  out1[7] = x20;
+  out1[8] = x21;
+  out1[9] = x23;
+  out1[10] = x25;
+  out1[11] = x26;
+  out1[12] = x27;
+  out1[13] = x29;
+  out1[14] = x31;
+  out1[15] = x32;
+  out1[16] = x33;
+  out1[17] = x35;
+  out1[18] = x37;
+  out1[19] = x38;
+  out1[20] = x39;
+  out1[21] = x41;
+  out1[22] = x43;
+  out1[23] = x44;
+  out1[24] = x45;
+  out1[25] = x47;
+  out1[26] = x49;
+  out1[27] = x50;
+  out1[28] = x51;
+  out1[29] = x53;
+  out1[30] = x55;
+  out1[31] = x56;
 }
 
 /*
- * The function fiat_p256_from_bytes deserializes a field element in the Montgomery domain from bytes in little-endian order.
+ * The function fiat_p256_from_bytes deserializes a field element NOT in the Montgomery domain from bytes in little-endian order.
+ *
  * Preconditions:
  *   0 ≤ bytes_eval arg1 < m
  * Postconditions:
@@ -3087,61 +4117,644 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
  */
-static void fiat_p256_from_bytes(uint32_t out1[8], const uint8_t arg1[32]) {
-  uint32_t x1 = ((uint32_t)(arg1[31]) << 24);
-  uint32_t x2 = ((uint32_t)(arg1[30]) << 16);
-  uint32_t x3 = ((uint32_t)(arg1[29]) << 8);
-  uint8_t x4 = (arg1[28]);
-  uint32_t x5 = ((uint32_t)(arg1[27]) << 24);
-  uint32_t x6 = ((uint32_t)(arg1[26]) << 16);
-  uint32_t x7 = ((uint32_t)(arg1[25]) << 8);
-  uint8_t x8 = (arg1[24]);
-  uint32_t x9 = ((uint32_t)(arg1[23]) << 24);
-  uint32_t x10 = ((uint32_t)(arg1[22]) << 16);
-  uint32_t x11 = ((uint32_t)(arg1[21]) << 8);
-  uint8_t x12 = (arg1[20]);
-  uint32_t x13 = ((uint32_t)(arg1[19]) << 24);
-  uint32_t x14 = ((uint32_t)(arg1[18]) << 16);
-  uint32_t x15 = ((uint32_t)(arg1[17]) << 8);
-  uint8_t x16 = (arg1[16]);
-  uint32_t x17 = ((uint32_t)(arg1[15]) << 24);
-  uint32_t x18 = ((uint32_t)(arg1[14]) << 16);
-  uint32_t x19 = ((uint32_t)(arg1[13]) << 8);
-  uint8_t x20 = (arg1[12]);
-  uint32_t x21 = ((uint32_t)(arg1[11]) << 24);
-  uint32_t x22 = ((uint32_t)(arg1[10]) << 16);
-  uint32_t x23 = ((uint32_t)(arg1[9]) << 8);
-  uint8_t x24 = (arg1[8]);
-  uint32_t x25 = ((uint32_t)(arg1[7]) << 24);
-  uint32_t x26 = ((uint32_t)(arg1[6]) << 16);
-  uint32_t x27 = ((uint32_t)(arg1[5]) << 8);
-  uint8_t x28 = (arg1[4]);
-  uint32_t x29 = ((uint32_t)(arg1[3]) << 24);
-  uint32_t x30 = ((uint32_t)(arg1[2]) << 16);
-  uint32_t x31 = ((uint32_t)(arg1[1]) << 8);
-  uint8_t x32 = (arg1[0]);
-  uint32_t x33 = (x32 + (x31 + (x30 + x29)));
-  uint32_t x34 = (x33 & UINT32_C(0xffffffff));
-  uint32_t x35 = (x4 + (x3 + (x2 + x1)));
-  uint32_t x36 = (x8 + (x7 + (x6 + x5)));
-  uint32_t x37 = (x12 + (x11 + (x10 + x9)));
-  uint32_t x38 = (x16 + (x15 + (x14 + x13)));
-  uint32_t x39 = (x20 + (x19 + (x18 + x17)));
-  uint32_t x40 = (x24 + (x23 + (x22 + x21)));
-  uint32_t x41 = (x28 + (x27 + (x26 + x25)));
-  uint32_t x42 = (x41 & UINT32_C(0xffffffff));
-  uint32_t x43 = (x40 & UINT32_C(0xffffffff));
-  uint32_t x44 = (x39 & UINT32_C(0xffffffff));
-  uint32_t x45 = (x38 & UINT32_C(0xffffffff));
-  uint32_t x46 = (x37 & UINT32_C(0xffffffff));
-  uint32_t x47 = (x36 & UINT32_C(0xffffffff));
-  out1[0] = x34;
-  out1[1] = x42;
-  out1[2] = x43;
+static FIAT_P256_FIAT_INLINE void fiat_p256_from_bytes(uint32_t out1[8], const uint8_t arg1[32]) {
+  uint32_t x1;
+  uint32_t x2;
+  uint32_t x3;
+  uint8_t x4;
+  uint32_t x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint8_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint32_t x11;
+  uint8_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint8_t x16;
+  uint32_t x17;
+  uint32_t x18;
+  uint32_t x19;
+  uint8_t x20;
+  uint32_t x21;
+  uint32_t x22;
+  uint32_t x23;
+  uint8_t x24;
+  uint32_t x25;
+  uint32_t x26;
+  uint32_t x27;
+  uint8_t x28;
+  uint32_t x29;
+  uint32_t x30;
+  uint32_t x31;
+  uint8_t x32;
+  uint32_t x33;
+  uint32_t x34;
+  uint32_t x35;
+  uint32_t x36;
+  uint32_t x37;
+  uint32_t x38;
+  uint32_t x39;
+  uint32_t x40;
+  uint32_t x41;
+  uint32_t x42;
+  uint32_t x43;
+  uint32_t x44;
+  uint32_t x45;
+  uint32_t x46;
+  uint32_t x47;
+  uint32_t x48;
+  uint32_t x49;
+  uint32_t x50;
+  uint32_t x51;
+  uint32_t x52;
+  uint32_t x53;
+  uint32_t x54;
+  uint32_t x55;
+  uint32_t x56;
+  x1 = ((uint32_t)(arg1[31]) << 24);
+  x2 = ((uint32_t)(arg1[30]) << 16);
+  x3 = ((uint32_t)(arg1[29]) << 8);
+  x4 = (arg1[28]);
+  x5 = ((uint32_t)(arg1[27]) << 24);
+  x6 = ((uint32_t)(arg1[26]) << 16);
+  x7 = ((uint32_t)(arg1[25]) << 8);
+  x8 = (arg1[24]);
+  x9 = ((uint32_t)(arg1[23]) << 24);
+  x10 = ((uint32_t)(arg1[22]) << 16);
+  x11 = ((uint32_t)(arg1[21]) << 8);
+  x12 = (arg1[20]);
+  x13 = ((uint32_t)(arg1[19]) << 24);
+  x14 = ((uint32_t)(arg1[18]) << 16);
+  x15 = ((uint32_t)(arg1[17]) << 8);
+  x16 = (arg1[16]);
+  x17 = ((uint32_t)(arg1[15]) << 24);
+  x18 = ((uint32_t)(arg1[14]) << 16);
+  x19 = ((uint32_t)(arg1[13]) << 8);
+  x20 = (arg1[12]);
+  x21 = ((uint32_t)(arg1[11]) << 24);
+  x22 = ((uint32_t)(arg1[10]) << 16);
+  x23 = ((uint32_t)(arg1[9]) << 8);
+  x24 = (arg1[8]);
+  x25 = ((uint32_t)(arg1[7]) << 24);
+  x26 = ((uint32_t)(arg1[6]) << 16);
+  x27 = ((uint32_t)(arg1[5]) << 8);
+  x28 = (arg1[4]);
+  x29 = ((uint32_t)(arg1[3]) << 24);
+  x30 = ((uint32_t)(arg1[2]) << 16);
+  x31 = ((uint32_t)(arg1[1]) << 8);
+  x32 = (arg1[0]);
+  x33 = (x31 + (uint32_t)x32);
+  x34 = (x30 + x33);
+  x35 = (x29 + x34);
+  x36 = (x27 + (uint32_t)x28);
+  x37 = (x26 + x36);
+  x38 = (x25 + x37);
+  x39 = (x23 + (uint32_t)x24);
+  x40 = (x22 + x39);
+  x41 = (x21 + x40);
+  x42 = (x19 + (uint32_t)x20);
+  x43 = (x18 + x42);
+  x44 = (x17 + x43);
+  x45 = (x15 + (uint32_t)x16);
+  x46 = (x14 + x45);
+  x47 = (x13 + x46);
+  x48 = (x11 + (uint32_t)x12);
+  x49 = (x10 + x48);
+  x50 = (x9 + x49);
+  x51 = (x7 + (uint32_t)x8);
+  x52 = (x6 + x51);
+  x53 = (x5 + x52);
+  x54 = (x3 + (uint32_t)x4);
+  x55 = (x2 + x54);
+  x56 = (x1 + x55);
+  out1[0] = x35;
+  out1[1] = x38;
+  out1[2] = x41;
   out1[3] = x44;
-  out1[4] = x45;
-  out1[5] = x46;
-  out1[6] = x47;
-  out1[7] = x35;
+  out1[4] = x47;
+  out1[5] = x50;
+  out1[6] = x53;
+  out1[7] = x56;
 }
 
+/*
+ * The function fiat_p256_set_one returns the field element one in the Montgomery domain.
+ *
+ * Postconditions:
+ *   eval (from_montgomery out1) mod m = 1 mod m
+ *   0 ≤ eval out1 < m
+ *
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_set_one(fiat_p256_montgomery_domain_field_element out1) {
+  out1[0] = 0x1;
+  out1[1] = 0x0;
+  out1[2] = 0x0;
+  out1[3] = UINT32_C(0xffffffff);
+  out1[4] = UINT32_C(0xffffffff);
+  out1[5] = UINT32_C(0xffffffff);
+  out1[6] = UINT32_C(0xfffffffe);
+  out1[7] = 0x0;
+}
+
+/*
+ * The function fiat_p256_msat returns the saturated representation of the prime modulus.
+ *
+ * Postconditions:
+ *   twos_complement_eval out1 = m
+ *   0 ≤ eval out1 < m
+ *
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_msat(uint32_t out1[9]) {
+  out1[0] = UINT32_C(0xffffffff);
+  out1[1] = UINT32_C(0xffffffff);
+  out1[2] = UINT32_C(0xffffffff);
+  out1[3] = 0x0;
+  out1[4] = 0x0;
+  out1[5] = 0x0;
+  out1[6] = 0x1;
+  out1[7] = UINT32_C(0xffffffff);
+  out1[8] = 0x0;
+}
+
+/*
+ * The function fiat_p256_divstep computes a divstep.
+ *
+ * Preconditions:
+ *   0 ≤ eval arg4 < m
+ *   0 ≤ eval arg5 < m
+ * Postconditions:
+ *   out1 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then 1 - arg1 else 1 + arg1)
+ *   twos_complement_eval out2 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then twos_complement_eval arg3 else twos_complement_eval arg2)
+ *   twos_complement_eval out3 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then ⌊(twos_complement_eval arg3 - twos_complement_eval arg2) / 2⌋ else ⌊(twos_complement_eval arg3 + (twos_complement_eval arg3 mod 2) * twos_complement_eval arg2) / 2⌋)
+ *   eval (from_montgomery out4) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (2 * eval (from_montgomery arg5)) mod m else (2 * eval (from_montgomery arg4)) mod m)
+ *   eval (from_montgomery out5) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (eval (from_montgomery arg5) - eval (from_montgomery arg4)) mod m else (eval (from_montgomery arg5) + (twos_complement_eval arg3 mod 2) * eval (from_montgomery arg4)) mod m)
+ *   0 ≤ eval out4 < m
+ *   0 ≤ eval out5 < m
+ *   0 ≤ eval out2 < m
+ *   0 ≤ eval out3 < m
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0xffffffff]
+ *   arg2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   arg4: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   arg5: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffff]
+ *   out2: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   out3: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   out4: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ *   out5: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_divstep(uint32_t* out1, uint32_t out2[9], uint32_t out3[9], uint32_t out4[8], uint32_t out5[8], uint32_t arg1, const uint32_t arg2[9], const uint32_t arg3[9], const uint32_t arg4[8], const uint32_t arg5[8]) {
+  uint32_t x1;
+  fiat_p256_uint1 x2;
+  fiat_p256_uint1 x3;
+  uint32_t x4;
+  fiat_p256_uint1 x5;
+  uint32_t x6;
+  uint32_t x7;
+  uint32_t x8;
+  uint32_t x9;
+  uint32_t x10;
+  uint32_t x11;
+  uint32_t x12;
+  uint32_t x13;
+  uint32_t x14;
+  uint32_t x15;
+  uint32_t x16;
+  fiat_p256_uint1 x17;
+  uint32_t x18;
+  fiat_p256_uint1 x19;
+  uint32_t x20;
+  fiat_p256_uint1 x21;
+  uint32_t x22;
+  fiat_p256_uint1 x23;
+  uint32_t x24;
+  fiat_p256_uint1 x25;
+  uint32_t x26;
+  fiat_p256_uint1 x27;
+  uint32_t x28;
+  fiat_p256_uint1 x29;
+  uint32_t x30;
+  fiat_p256_uint1 x31;
+  uint32_t x32;
+  fiat_p256_uint1 x33;
+  uint32_t x34;
+  uint32_t x35;
+  uint32_t x36;
+  uint32_t x37;
+  uint32_t x38;
+  uint32_t x39;
+  uint32_t x40;
+  uint32_t x41;
+  uint32_t x42;
+  uint32_t x43;
+  uint32_t x44;
+  uint32_t x45;
+  uint32_t x46;
+  uint32_t x47;
+  uint32_t x48;
+  uint32_t x49;
+  uint32_t x50;
+  uint32_t x51;
+  fiat_p256_uint1 x52;
+  uint32_t x53;
+  fiat_p256_uint1 x54;
+  uint32_t x55;
+  fiat_p256_uint1 x56;
+  uint32_t x57;
+  fiat_p256_uint1 x58;
+  uint32_t x59;
+  fiat_p256_uint1 x60;
+  uint32_t x61;
+  fiat_p256_uint1 x62;
+  uint32_t x63;
+  fiat_p256_uint1 x64;
+  uint32_t x65;
+  fiat_p256_uint1 x66;
+  uint32_t x67;
+  fiat_p256_uint1 x68;
+  uint32_t x69;
+  fiat_p256_uint1 x70;
+  uint32_t x71;
+  fiat_p256_uint1 x72;
+  uint32_t x73;
+  fiat_p256_uint1 x74;
+  uint32_t x75;
+  fiat_p256_uint1 x76;
+  uint32_t x77;
+  fiat_p256_uint1 x78;
+  uint32_t x79;
+  fiat_p256_uint1 x80;
+  uint32_t x81;
+  fiat_p256_uint1 x82;
+  uint32_t x83;
+  fiat_p256_uint1 x84;
+  uint32_t x85;
+  uint32_t x86;
+  uint32_t x87;
+  uint32_t x88;
+  uint32_t x89;
+  uint32_t x90;
+  uint32_t x91;
+  uint32_t x92;
+  uint32_t x93;
+  fiat_p256_uint1 x94;
+  uint32_t x95;
+  fiat_p256_uint1 x96;
+  uint32_t x97;
+  fiat_p256_uint1 x98;
+  uint32_t x99;
+  fiat_p256_uint1 x100;
+  uint32_t x101;
+  fiat_p256_uint1 x102;
+  uint32_t x103;
+  fiat_p256_uint1 x104;
+  uint32_t x105;
+  fiat_p256_uint1 x106;
+  uint32_t x107;
+  fiat_p256_uint1 x108;
+  uint32_t x109;
+  uint32_t x110;
+  fiat_p256_uint1 x111;
+  uint32_t x112;
+  fiat_p256_uint1 x113;
+  uint32_t x114;
+  fiat_p256_uint1 x115;
+  uint32_t x116;
+  fiat_p256_uint1 x117;
+  uint32_t x118;
+  fiat_p256_uint1 x119;
+  uint32_t x120;
+  fiat_p256_uint1 x121;
+  uint32_t x122;
+  fiat_p256_uint1 x123;
+  uint32_t x124;
+  fiat_p256_uint1 x125;
+  uint32_t x126;
+  uint32_t x127;
+  uint32_t x128;
+  uint32_t x129;
+  uint32_t x130;
+  uint32_t x131;
+  uint32_t x132;
+  uint32_t x133;
+  fiat_p256_uint1 x134;
+  uint32_t x135;
+  uint32_t x136;
+  uint32_t x137;
+  uint32_t x138;
+  uint32_t x139;
+  uint32_t x140;
+  uint32_t x141;
+  uint32_t x142;
+  uint32_t x143;
+  uint32_t x144;
+  fiat_p256_uint1 x145;
+  uint32_t x146;
+  fiat_p256_uint1 x147;
+  uint32_t x148;
+  fiat_p256_uint1 x149;
+  uint32_t x150;
+  fiat_p256_uint1 x151;
+  uint32_t x152;
+  fiat_p256_uint1 x153;
+  uint32_t x154;
+  fiat_p256_uint1 x155;
+  uint32_t x156;
+  fiat_p256_uint1 x157;
+  uint32_t x158;
+  fiat_p256_uint1 x159;
+  uint32_t x160;
+  fiat_p256_uint1 x161;
+  uint32_t x162;
+  uint32_t x163;
+  uint32_t x164;
+  uint32_t x165;
+  uint32_t x166;
+  uint32_t x167;
+  uint32_t x168;
+  uint32_t x169;
+  uint32_t x170;
+  fiat_p256_uint1 x171;
+  uint32_t x172;
+  fiat_p256_uint1 x173;
+  uint32_t x174;
+  fiat_p256_uint1 x175;
+  uint32_t x176;
+  fiat_p256_uint1 x177;
+  uint32_t x178;
+  fiat_p256_uint1 x179;
+  uint32_t x180;
+  fiat_p256_uint1 x181;
+  uint32_t x182;
+  fiat_p256_uint1 x183;
+  uint32_t x184;
+  fiat_p256_uint1 x185;
+  uint32_t x186;
+  fiat_p256_uint1 x187;
+  uint32_t x188;
+  fiat_p256_uint1 x189;
+  uint32_t x190;
+  fiat_p256_uint1 x191;
+  uint32_t x192;
+  fiat_p256_uint1 x193;
+  uint32_t x194;
+  fiat_p256_uint1 x195;
+  uint32_t x196;
+  fiat_p256_uint1 x197;
+  uint32_t x198;
+  fiat_p256_uint1 x199;
+  uint32_t x200;
+  fiat_p256_uint1 x201;
+  uint32_t x202;
+  fiat_p256_uint1 x203;
+  uint32_t x204;
+  fiat_p256_uint1 x205;
+  uint32_t x206;
+  uint32_t x207;
+  uint32_t x208;
+  uint32_t x209;
+  uint32_t x210;
+  uint32_t x211;
+  uint32_t x212;
+  uint32_t x213;
+  uint32_t x214;
+  uint32_t x215;
+  uint32_t x216;
+  uint32_t x217;
+  uint32_t x218;
+  uint32_t x219;
+  uint32_t x220;
+  uint32_t x221;
+  uint32_t x222;
+  uint32_t x223;
+  uint32_t x224;
+  uint32_t x225;
+  uint32_t x226;
+  uint32_t x227;
+  uint32_t x228;
+  uint32_t x229;
+  uint32_t x230;
+  fiat_p256_addcarryx_u32(&x1, &x2, 0x0, (~arg1), 0x1);
+  x3 = (fiat_p256_uint1)((fiat_p256_uint1)(x1 >> 31) & (fiat_p256_uint1)((arg3[0]) & 0x1));
+  fiat_p256_addcarryx_u32(&x4, &x5, 0x0, (~arg1), 0x1);
+  fiat_p256_cmovznz_u32(&x6, x3, arg1, x4);
+  fiat_p256_cmovznz_u32(&x7, x3, (arg2[0]), (arg3[0]));
+  fiat_p256_cmovznz_u32(&x8, x3, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u32(&x9, x3, (arg2[2]), (arg3[2]));
+  fiat_p256_cmovznz_u32(&x10, x3, (arg2[3]), (arg3[3]));
+  fiat_p256_cmovznz_u32(&x11, x3, (arg2[4]), (arg3[4]));
+  fiat_p256_cmovznz_u32(&x12, x3, (arg2[5]), (arg3[5]));
+  fiat_p256_cmovznz_u32(&x13, x3, (arg2[6]), (arg3[6]));
+  fiat_p256_cmovznz_u32(&x14, x3, (arg2[7]), (arg3[7]));
+  fiat_p256_cmovznz_u32(&x15, x3, (arg2[8]), (arg3[8]));
+  fiat_p256_addcarryx_u32(&x16, &x17, 0x0, 0x1, (~(arg2[0])));
+  fiat_p256_addcarryx_u32(&x18, &x19, x17, 0x0, (~(arg2[1])));
+  fiat_p256_addcarryx_u32(&x20, &x21, x19, 0x0, (~(arg2[2])));
+  fiat_p256_addcarryx_u32(&x22, &x23, x21, 0x0, (~(arg2[3])));
+  fiat_p256_addcarryx_u32(&x24, &x25, x23, 0x0, (~(arg2[4])));
+  fiat_p256_addcarryx_u32(&x26, &x27, x25, 0x0, (~(arg2[5])));
+  fiat_p256_addcarryx_u32(&x28, &x29, x27, 0x0, (~(arg2[6])));
+  fiat_p256_addcarryx_u32(&x30, &x31, x29, 0x0, (~(arg2[7])));
+  fiat_p256_addcarryx_u32(&x32, &x33, x31, 0x0, (~(arg2[8])));
+  fiat_p256_cmovznz_u32(&x34, x3, (arg3[0]), x16);
+  fiat_p256_cmovznz_u32(&x35, x3, (arg3[1]), x18);
+  fiat_p256_cmovznz_u32(&x36, x3, (arg3[2]), x20);
+  fiat_p256_cmovznz_u32(&x37, x3, (arg3[3]), x22);
+  fiat_p256_cmovznz_u32(&x38, x3, (arg3[4]), x24);
+  fiat_p256_cmovznz_u32(&x39, x3, (arg3[5]), x26);
+  fiat_p256_cmovznz_u32(&x40, x3, (arg3[6]), x28);
+  fiat_p256_cmovznz_u32(&x41, x3, (arg3[7]), x30);
+  fiat_p256_cmovznz_u32(&x42, x3, (arg3[8]), x32);
+  fiat_p256_cmovznz_u32(&x43, x3, (arg4[0]), (arg5[0]));
+  fiat_p256_cmovznz_u32(&x44, x3, (arg4[1]), (arg5[1]));
+  fiat_p256_cmovznz_u32(&x45, x3, (arg4[2]), (arg5[2]));
+  fiat_p256_cmovznz_u32(&x46, x3, (arg4[3]), (arg5[3]));
+  fiat_p256_cmovznz_u32(&x47, x3, (arg4[4]), (arg5[4]));
+  fiat_p256_cmovznz_u32(&x48, x3, (arg4[5]), (arg5[5]));
+  fiat_p256_cmovznz_u32(&x49, x3, (arg4[6]), (arg5[6]));
+  fiat_p256_cmovznz_u32(&x50, x3, (arg4[7]), (arg5[7]));
+  fiat_p256_addcarryx_u32(&x51, &x52, 0x0, x43, x43);
+  fiat_p256_addcarryx_u32(&x53, &x54, x52, x44, x44);
+  fiat_p256_addcarryx_u32(&x55, &x56, x54, x45, x45);
+  fiat_p256_addcarryx_u32(&x57, &x58, x56, x46, x46);
+  fiat_p256_addcarryx_u32(&x59, &x60, x58, x47, x47);
+  fiat_p256_addcarryx_u32(&x61, &x62, x60, x48, x48);
+  fiat_p256_addcarryx_u32(&x63, &x64, x62, x49, x49);
+  fiat_p256_addcarryx_u32(&x65, &x66, x64, x50, x50);
+  fiat_p256_subborrowx_u32(&x67, &x68, 0x0, x51, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x69, &x70, x68, x53, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x71, &x72, x70, x55, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x73, &x74, x72, x57, 0x0);
+  fiat_p256_subborrowx_u32(&x75, &x76, x74, x59, 0x0);
+  fiat_p256_subborrowx_u32(&x77, &x78, x76, x61, 0x0);
+  fiat_p256_subborrowx_u32(&x79, &x80, x78, x63, 0x1);
+  fiat_p256_subborrowx_u32(&x81, &x82, x80, x65, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x83, &x84, x82, x66, 0x0);
+  x85 = (arg4[7]);
+  x86 = (arg4[6]);
+  x87 = (arg4[5]);
+  x88 = (arg4[4]);
+  x89 = (arg4[3]);
+  x90 = (arg4[2]);
+  x91 = (arg4[1]);
+  x92 = (arg4[0]);
+  fiat_p256_subborrowx_u32(&x93, &x94, 0x0, 0x0, x92);
+  fiat_p256_subborrowx_u32(&x95, &x96, x94, 0x0, x91);
+  fiat_p256_subborrowx_u32(&x97, &x98, x96, 0x0, x90);
+  fiat_p256_subborrowx_u32(&x99, &x100, x98, 0x0, x89);
+  fiat_p256_subborrowx_u32(&x101, &x102, x100, 0x0, x88);
+  fiat_p256_subborrowx_u32(&x103, &x104, x102, 0x0, x87);
+  fiat_p256_subborrowx_u32(&x105, &x106, x104, 0x0, x86);
+  fiat_p256_subborrowx_u32(&x107, &x108, x106, 0x0, x85);
+  fiat_p256_cmovznz_u32(&x109, x108, 0x0, UINT32_C(0xffffffff));
+  fiat_p256_addcarryx_u32(&x110, &x111, 0x0, x93, x109);
+  fiat_p256_addcarryx_u32(&x112, &x113, x111, x95, x109);
+  fiat_p256_addcarryx_u32(&x114, &x115, x113, x97, x109);
+  fiat_p256_addcarryx_u32(&x116, &x117, x115, x99, 0x0);
+  fiat_p256_addcarryx_u32(&x118, &x119, x117, x101, 0x0);
+  fiat_p256_addcarryx_u32(&x120, &x121, x119, x103, 0x0);
+  fiat_p256_addcarryx_u32(&x122, &x123, x121, x105, (fiat_p256_uint1)(x109 & 0x1));
+  fiat_p256_addcarryx_u32(&x124, &x125, x123, x107, x109);
+  fiat_p256_cmovznz_u32(&x126, x3, (arg5[0]), x110);
+  fiat_p256_cmovznz_u32(&x127, x3, (arg5[1]), x112);
+  fiat_p256_cmovznz_u32(&x128, x3, (arg5[2]), x114);
+  fiat_p256_cmovznz_u32(&x129, x3, (arg5[3]), x116);
+  fiat_p256_cmovznz_u32(&x130, x3, (arg5[4]), x118);
+  fiat_p256_cmovznz_u32(&x131, x3, (arg5[5]), x120);
+  fiat_p256_cmovznz_u32(&x132, x3, (arg5[6]), x122);
+  fiat_p256_cmovznz_u32(&x133, x3, (arg5[7]), x124);
+  x134 = (fiat_p256_uint1)(x34 & 0x1);
+  fiat_p256_cmovznz_u32(&x135, x134, 0x0, x7);
+  fiat_p256_cmovznz_u32(&x136, x134, 0x0, x8);
+  fiat_p256_cmovznz_u32(&x137, x134, 0x0, x9);
+  fiat_p256_cmovznz_u32(&x138, x134, 0x0, x10);
+  fiat_p256_cmovznz_u32(&x139, x134, 0x0, x11);
+  fiat_p256_cmovznz_u32(&x140, x134, 0x0, x12);
+  fiat_p256_cmovznz_u32(&x141, x134, 0x0, x13);
+  fiat_p256_cmovznz_u32(&x142, x134, 0x0, x14);
+  fiat_p256_cmovznz_u32(&x143, x134, 0x0, x15);
+  fiat_p256_addcarryx_u32(&x144, &x145, 0x0, x34, x135);
+  fiat_p256_addcarryx_u32(&x146, &x147, x145, x35, x136);
+  fiat_p256_addcarryx_u32(&x148, &x149, x147, x36, x137);
+  fiat_p256_addcarryx_u32(&x150, &x151, x149, x37, x138);
+  fiat_p256_addcarryx_u32(&x152, &x153, x151, x38, x139);
+  fiat_p256_addcarryx_u32(&x154, &x155, x153, x39, x140);
+  fiat_p256_addcarryx_u32(&x156, &x157, x155, x40, x141);
+  fiat_p256_addcarryx_u32(&x158, &x159, x157, x41, x142);
+  fiat_p256_addcarryx_u32(&x160, &x161, x159, x42, x143);
+  fiat_p256_cmovznz_u32(&x162, x134, 0x0, x43);
+  fiat_p256_cmovznz_u32(&x163, x134, 0x0, x44);
+  fiat_p256_cmovznz_u32(&x164, x134, 0x0, x45);
+  fiat_p256_cmovznz_u32(&x165, x134, 0x0, x46);
+  fiat_p256_cmovznz_u32(&x166, x134, 0x0, x47);
+  fiat_p256_cmovznz_u32(&x167, x134, 0x0, x48);
+  fiat_p256_cmovznz_u32(&x168, x134, 0x0, x49);
+  fiat_p256_cmovznz_u32(&x169, x134, 0x0, x50);
+  fiat_p256_addcarryx_u32(&x170, &x171, 0x0, x126, x162);
+  fiat_p256_addcarryx_u32(&x172, &x173, x171, x127, x163);
+  fiat_p256_addcarryx_u32(&x174, &x175, x173, x128, x164);
+  fiat_p256_addcarryx_u32(&x176, &x177, x175, x129, x165);
+  fiat_p256_addcarryx_u32(&x178, &x179, x177, x130, x166);
+  fiat_p256_addcarryx_u32(&x180, &x181, x179, x131, x167);
+  fiat_p256_addcarryx_u32(&x182, &x183, x181, x132, x168);
+  fiat_p256_addcarryx_u32(&x184, &x185, x183, x133, x169);
+  fiat_p256_subborrowx_u32(&x186, &x187, 0x0, x170, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x188, &x189, x187, x172, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x190, &x191, x189, x174, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x192, &x193, x191, x176, 0x0);
+  fiat_p256_subborrowx_u32(&x194, &x195, x193, x178, 0x0);
+  fiat_p256_subborrowx_u32(&x196, &x197, x195, x180, 0x0);
+  fiat_p256_subborrowx_u32(&x198, &x199, x197, x182, 0x1);
+  fiat_p256_subborrowx_u32(&x200, &x201, x199, x184, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u32(&x202, &x203, x201, x185, 0x0);
+  fiat_p256_addcarryx_u32(&x204, &x205, 0x0, x6, 0x1);
+  x206 = ((x144 >> 1) | ((x146 << 31) & UINT32_C(0xffffffff)));
+  x207 = ((x146 >> 1) | ((x148 << 31) & UINT32_C(0xffffffff)));
+  x208 = ((x148 >> 1) | ((x150 << 31) & UINT32_C(0xffffffff)));
+  x209 = ((x150 >> 1) | ((x152 << 31) & UINT32_C(0xffffffff)));
+  x210 = ((x152 >> 1) | ((x154 << 31) & UINT32_C(0xffffffff)));
+  x211 = ((x154 >> 1) | ((x156 << 31) & UINT32_C(0xffffffff)));
+  x212 = ((x156 >> 1) | ((x158 << 31) & UINT32_C(0xffffffff)));
+  x213 = ((x158 >> 1) | ((x160 << 31) & UINT32_C(0xffffffff)));
+  x214 = ((x160 & UINT32_C(0x80000000)) | (x160 >> 1));
+  fiat_p256_cmovznz_u32(&x215, x84, x67, x51);
+  fiat_p256_cmovznz_u32(&x216, x84, x69, x53);
+  fiat_p256_cmovznz_u32(&x217, x84, x71, x55);
+  fiat_p256_cmovznz_u32(&x218, x84, x73, x57);
+  fiat_p256_cmovznz_u32(&x219, x84, x75, x59);
+  fiat_p256_cmovznz_u32(&x220, x84, x77, x61);
+  fiat_p256_cmovznz_u32(&x221, x84, x79, x63);
+  fiat_p256_cmovznz_u32(&x222, x84, x81, x65);
+  fiat_p256_cmovznz_u32(&x223, x203, x186, x170);
+  fiat_p256_cmovznz_u32(&x224, x203, x188, x172);
+  fiat_p256_cmovznz_u32(&x225, x203, x190, x174);
+  fiat_p256_cmovznz_u32(&x226, x203, x192, x176);
+  fiat_p256_cmovznz_u32(&x227, x203, x194, x178);
+  fiat_p256_cmovznz_u32(&x228, x203, x196, x180);
+  fiat_p256_cmovznz_u32(&x229, x203, x198, x182);
+  fiat_p256_cmovznz_u32(&x230, x203, x200, x184);
+  *out1 = x204;
+  out2[0] = x7;
+  out2[1] = x8;
+  out2[2] = x9;
+  out2[3] = x10;
+  out2[4] = x11;
+  out2[5] = x12;
+  out2[6] = x13;
+  out2[7] = x14;
+  out2[8] = x15;
+  out3[0] = x206;
+  out3[1] = x207;
+  out3[2] = x208;
+  out3[3] = x209;
+  out3[4] = x210;
+  out3[5] = x211;
+  out3[6] = x212;
+  out3[7] = x213;
+  out3[8] = x214;
+  out4[0] = x215;
+  out4[1] = x216;
+  out4[2] = x217;
+  out4[3] = x218;
+  out4[4] = x219;
+  out4[5] = x220;
+  out4[6] = x221;
+  out4[7] = x222;
+  out5[0] = x223;
+  out5[1] = x224;
+  out5[2] = x225;
+  out5[3] = x226;
+  out5[4] = x227;
+  out5[5] = x228;
+  out5[6] = x229;
+  out5[7] = x230;
+}
+
+/*
+ * The function fiat_p256_divstep_precomp returns the precomputed value for Bernstein-Yang-inversion (in montgomery form).
+ *
+ * Postconditions:
+ *   eval (from_montgomery out1) = ⌊(m - 1) / 2⌋^(if ⌊log2 m⌋ + 1 < 46 then ⌊(49 * (⌊log2 m⌋ + 1) + 80) / 17⌋ else ⌊(49 * (⌊log2 m⌋ + 1) + 57) / 17⌋)
+ *   0 ≤ eval out1 < m
+ *
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff], [0x0 ~> 0xffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_divstep_precomp(uint32_t out1[8]) {
+  out1[0] = UINT32_C(0xb8000000);
+  out1[1] = UINT32_C(0x67ffffff);
+  out1[2] = UINT32_C(0x38000000);
+  out1[3] = UINT32_C(0xc0000000);
+  out1[4] = UINT32_C(0x7fffffff);
+  out1[5] = UINT32_C(0xd8000000);
+  out1[6] = UINT32_C(0xffffffff);
+  out1[7] = UINT32_C(0x2fffffff);
+}
diff --git a/third_party/fiat/p256_64.h b/third_party/fiat/p256_64.h
index 773266a..c772638 100644
--- a/third_party/fiat/p256_64.h
+++ b/third_party/fiat/p256_64.h
@@ -1,8 +1,8 @@
-/* Autogenerated: src/ExtractionOCaml/word_by_word_montgomery --static p256 '2^256 - 2^224 + 2^192 + 2^96 - 1' 64 mul square add sub opp from_montgomery nonzero selectznz to_bytes from_bytes */
+/* Autogenerated: 'src/ExtractionOCaml/word_by_word_montgomery' --inline --static --use-value-barrier p256 64 '2^256 - 2^224 + 2^192 + 2^96 - 1' mul square add sub opp from_montgomery to_montgomery nonzero selectznz to_bytes from_bytes one msat divstep divstep_precomp */
 /* curve description: p256 */
-/* requested operations: mul, square, add, sub, opp, from_montgomery, nonzero, selectznz, to_bytes, from_bytes */
-/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
 /* machine_wordsize = 64 (from "64") */
+/* requested operations: mul, square, add, sub, opp, from_montgomery, to_montgomery, nonzero, selectznz, to_bytes, from_bytes, one, msat, divstep, divstep_precomp */
+/* m = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff (from "2^256 - 2^224 + 2^192 + 2^96 - 1") */
 /*                                                                    */
 /* NOTE: In addition to the bounds specified above each function, all */
 /*   functions synthesized for this Montgomery arithmetic require the */
@@ -10,20 +10,52 @@
 /*   require the input to be in the unique saturated representation.  */
 /*   All functions also ensure that these two properties are true of  */
 /*   return values.                                                   */
+/*  */
+/* Computed values: */
+/*   eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) */
+/*   bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) */
+/*   twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in */
+/*                            if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 */
 
 #include <stdint.h>
 typedef unsigned char fiat_p256_uint1;
 typedef signed char fiat_p256_int1;
-typedef signed __int128 fiat_p256_int128;
-typedef unsigned __int128 fiat_p256_uint128;
+#if defined(__GNUC__) || defined(__clang__)  /* GNU-style compilers: use their spellings for the extension marker and the inline hint */
+#  define FIAT_P256_FIAT_EXTENSION __extension__
+#  define FIAT_P256_FIAT_INLINE __inline__
+#else  /* any other compiler: both macros expand to nothing */
+#  define FIAT_P256_FIAT_EXTENSION
+#  define FIAT_P256_FIAT_INLINE
+#endif  /* defined(__GNUC__) || defined(__clang__) */
+
+FIAT_P256_FIAT_EXTENSION typedef signed __int128 fiat_p256_int128;  /* signed 128-bit type; __int128 is a compiler extension, hence the marker */
+FIAT_P256_FIAT_EXTENSION typedef unsigned __int128 fiat_p256_uint128;  /* unsigned 128-bit type */
+
+/* The type fiat_p256_montgomery_domain_field_element is a field element in the Montgomery domain. */
+/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */
+typedef uint64_t fiat_p256_montgomery_domain_field_element[4];
+
+/* The type fiat_p256_non_montgomery_domain_field_element is a field element NOT in the Montgomery domain. */
+/* Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] */
+typedef uint64_t fiat_p256_non_montgomery_domain_field_element[4];
 
 #if (-1 & 3) != 3
 #error "This code only works on a two's complement system"
 #endif
 
+#if !defined(FIAT_P256_NO_ASM) && (defined(__GNUC__) || defined(__clang__))  /* inline asm allowed and supported */
+static __inline__ uint64_t fiat_p256_value_barrier_u64(uint64_t a) {  /* returns a unchanged */
+  __asm__("" : "+r"(a) : /* no inputs */);  /* empty template; a is a read-write register operand, so the compiler must assume it may have changed */
+  return a;
+}
+#else  /* FIAT_P256_NO_ASM defined, or not a GNU-style compiler: the barrier is the plain identity */
+#  define fiat_p256_value_barrier_u64(x) (x)
+#endif  /* value barrier selection */
+
 
 /*
  * The function fiat_p256_addcarryx_u64 is an addition with carry.
+ *
  * Postconditions:
  *   out1 = (arg1 + arg2 + arg3) mod 2^64
  *   out2 = ⌊(arg1 + arg2 + arg3) / 2^64⌋
@@ -36,16 +68,20 @@
  *   out1: [0x0 ~> 0xffffffffffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  fiat_p256_uint128 x1 = ((arg1 + (fiat_p256_uint128)arg2) + arg3);
-  uint64_t x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
-  fiat_p256_uint1 x3 = (fiat_p256_uint1)(x1 >> 64);
+static FIAT_P256_FIAT_INLINE void fiat_p256_addcarryx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  fiat_p256_uint128 x1;  /* arg1 + arg2 + arg3, held in 128 bits so the carry is not lost */
+  uint64_t x2;  /* low 64 bits of the sum; written to *out1 */
+  fiat_p256_uint1 x3;  /* carry out of bit 63; written to *out2 */
+  x1 = ((arg1 + (fiat_p256_uint128)arg2) + arg3);  /* the cast widens the addition to 128 bits */
+  x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
+  x3 = (fiat_p256_uint1)(x1 >> 64);
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_p256_subborrowx_u64 is a subtraction with borrow.
+ *
  * Postconditions:
  *   out1 = (-arg1 + arg2 + -arg3) mod 2^64
  *   out2 = -⌊(-arg1 + arg2 + -arg3) / 2^64⌋
@@ -58,16 +94,20 @@
  *   out1: [0x0 ~> 0xffffffffffffffff]
  *   out2: [0x0 ~> 0x1]
  */
-static void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  fiat_p256_int128 x1 = ((arg2 - (fiat_p256_int128)arg1) - arg3);
-  fiat_p256_int1 x2 = (fiat_p256_int1)(x1 >> 64);
-  uint64_t x3 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
+static FIAT_P256_FIAT_INLINE void fiat_p256_subborrowx_u64(uint64_t* out1, fiat_p256_uint1* out2, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  fiat_p256_int128 x1;  /* arg2 - arg1 - arg3 in signed 128 bits; negative exactly when a borrow is needed */
+  fiat_p256_int1 x2;  /* upper part of x1 (x1 >> 64) narrowed to the signed 1-bit type; negated on the last line to form *out2 */
+  uint64_t x3;  /* low 64 bits of the difference; written to *out1 */
+  x1 = ((arg2 - (fiat_p256_int128)arg1) - arg3);  /* the cast makes the whole subtraction signed and 128 bits wide */
+  x2 = (fiat_p256_int1)(x1 >> 64);
+  x3 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
   *out1 = x3;
   *out2 = (fiat_p256_uint1)(0x0 - x2);
 }
 
 /*
  * The function fiat_p256_mulx_u64 is a multiplication, returning the full double-width result.
+ *
  * Postconditions:
  *   out1 = (arg1 * arg2) mod 2^64
  *   out2 = ⌊arg1 * arg2 / 2^64⌋
@@ -79,16 +119,20 @@
  *   out1: [0x0 ~> 0xffffffffffffffff]
  *   out2: [0x0 ~> 0xffffffffffffffff]
  */
-static void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) {
-  fiat_p256_uint128 x1 = ((fiat_p256_uint128)arg1 * arg2);
-  uint64_t x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
-  uint64_t x3 = (uint64_t)(x1 >> 64);
+static FIAT_P256_FIAT_INLINE void fiat_p256_mulx_u64(uint64_t* out1, uint64_t* out2, uint64_t arg1, uint64_t arg2) {
+  fiat_p256_uint128 x1;  /* full 128-bit product arg1 * arg2 */
+  uint64_t x2;  /* low 64 bits of the product; written to *out1 */
+  uint64_t x3;  /* high 64 bits of the product; written to *out2 */
+  x1 = ((fiat_p256_uint128)arg1 * arg2);  /* widen one operand first so the multiply is done in 128 bits */
+  x2 = (uint64_t)(x1 & UINT64_C(0xffffffffffffffff));
+  x3 = (uint64_t)(x1 >> 64);
   *out1 = x2;
   *out2 = x3;
 }
 
 /*
  * The function fiat_p256_cmovznz_u64 is a single-word conditional move.
+ *
  * Postconditions:
  *   out1 = (if arg1 = 0 then arg2 else arg3)
  *
@@ -99,21 +143,19 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffffffffffff]
  */
-static void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
-  fiat_p256_uint1 x1 = (!(!arg1));
-  uint64_t x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));
-  // Note this line has been patched from the synthesized code to add value
-  // barriers.
-  //
-  // Clang recognizes this pattern as a select. While it usually transforms it
-  // to a cmov, it sometimes further transforms it into a branch, which we do
-  // not want.
-  uint64_t x3 = ((value_barrier_u64(x2) & arg3) | (value_barrier_u64(~x2) & arg2));
+static FIAT_P256_FIAT_INLINE void fiat_p256_cmovznz_u64(uint64_t* out1, fiat_p256_uint1 arg1, uint64_t arg2, uint64_t arg3) {
+  fiat_p256_uint1 x1;  /* arg1 normalized to exactly 0 or 1 */
+  uint64_t x2;  /* selection mask: all ones when arg1 != 0, otherwise zero */
+  uint64_t x3;  /* selected word; written to *out1 */
+  x1 = (!(!arg1));
+  x2 = ((fiat_p256_int1)(0x0 - x1) & UINT64_C(0xffffffffffffffff));  /* 0 - 1 gives -1, which widens to an all-ones 64-bit mask */
+  x3 = ((fiat_p256_value_barrier_u64(x2) & arg3) | (fiat_p256_value_barrier_u64((~x2)) & arg2));  /* mask select: arg3 if arg1 != 0, else arg2; the value barriers are there so the compiler does not recognize the select and turn it into a branch */
   *out1 = x3;
 }
 
 /*
  * The function fiat_p256_mul multiplies two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -121,287 +163,297 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_mul(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) {
-  uint64_t x1 = (arg1[1]);
-  uint64_t x2 = (arg1[2]);
-  uint64_t x3 = (arg1[3]);
-  uint64_t x4 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_mul(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
   uint64_t x5;
   uint64_t x6;
-  fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3]));
   uint64_t x7;
   uint64_t x8;
-  fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2]));
   uint64_t x9;
   uint64_t x10;
-  fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1]));
   uint64_t x11;
   uint64_t x12;
-  fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0]));
   uint64_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9);
   uint64_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7);
   uint64_t x17;
   fiat_p256_uint1 x18;
-  fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5);
-  uint64_t x19 = (x18 + x6);
+  uint64_t x19;
   uint64_t x20;
   uint64_t x21;
-  fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001));
   uint64_t x22;
   uint64_t x23;
-  fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff));
   uint64_t x24;
   uint64_t x25;
-  fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff));
   uint64_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22);
-  uint64_t x28 = (x27 + x23);
+  uint64_t x28;
   uint64_t x29;
   fiat_p256_uint1 x30;
-  fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24);
   uint64_t x31;
   fiat_p256_uint1 x32;
-  fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26);
   uint64_t x33;
   fiat_p256_uint1 x34;
-  fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28);
   uint64_t x35;
   fiat_p256_uint1 x36;
-  fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20);
   uint64_t x37;
   fiat_p256_uint1 x38;
-  fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21);
   uint64_t x39;
   uint64_t x40;
-  fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3]));
   uint64_t x41;
   uint64_t x42;
-  fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2]));
   uint64_t x43;
   uint64_t x44;
-  fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1]));
   uint64_t x45;
   uint64_t x46;
-  fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0]));
   uint64_t x47;
   fiat_p256_uint1 x48;
-  fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43);
   uint64_t x49;
   fiat_p256_uint1 x50;
-  fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41);
   uint64_t x51;
   fiat_p256_uint1 x52;
-  fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39);
-  uint64_t x53 = (x52 + x40);
+  uint64_t x53;
   uint64_t x54;
   fiat_p256_uint1 x55;
-  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45);
   uint64_t x56;
   fiat_p256_uint1 x57;
-  fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47);
   uint64_t x58;
   fiat_p256_uint1 x59;
-  fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49);
   uint64_t x60;
   fiat_p256_uint1 x61;
-  fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51);
   uint64_t x62;
   fiat_p256_uint1 x63;
-  fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53);
   uint64_t x64;
   uint64_t x65;
-  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001));
   uint64_t x66;
   uint64_t x67;
-  fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff));
   uint64_t x68;
   uint64_t x69;
-  fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff));
   uint64_t x70;
   fiat_p256_uint1 x71;
-  fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66);
-  uint64_t x72 = (x71 + x67);
+  uint64_t x72;
   uint64_t x73;
   fiat_p256_uint1 x74;
-  fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68);
   uint64_t x75;
   fiat_p256_uint1 x76;
-  fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70);
   uint64_t x77;
   fiat_p256_uint1 x78;
-  fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72);
   uint64_t x79;
   fiat_p256_uint1 x80;
-  fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64);
   uint64_t x81;
   fiat_p256_uint1 x82;
-  fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65);
-  uint64_t x83 = ((uint64_t)x82 + x63);
+  uint64_t x83;
   uint64_t x84;
   uint64_t x85;
-  fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3]));
   uint64_t x86;
   uint64_t x87;
-  fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2]));
   uint64_t x88;
   uint64_t x89;
-  fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1]));
   uint64_t x90;
   uint64_t x91;
-  fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0]));
   uint64_t x92;
   fiat_p256_uint1 x93;
-  fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88);
   uint64_t x94;
   fiat_p256_uint1 x95;
-  fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86);
   uint64_t x96;
   fiat_p256_uint1 x97;
-  fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84);
-  uint64_t x98 = (x97 + x85);
+  uint64_t x98;
   uint64_t x99;
   fiat_p256_uint1 x100;
-  fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90);
   uint64_t x101;
   fiat_p256_uint1 x102;
-  fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92);
   uint64_t x103;
   fiat_p256_uint1 x104;
-  fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94);
   uint64_t x105;
   fiat_p256_uint1 x106;
-  fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96);
   uint64_t x107;
   fiat_p256_uint1 x108;
-  fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98);
   uint64_t x109;
   uint64_t x110;
-  fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001));
   uint64_t x111;
   uint64_t x112;
-  fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff));
   uint64_t x113;
   uint64_t x114;
-  fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff));
   uint64_t x115;
   fiat_p256_uint1 x116;
-  fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111);
-  uint64_t x117 = (x116 + x112);
+  uint64_t x117;
   uint64_t x118;
   fiat_p256_uint1 x119;
-  fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113);
   uint64_t x120;
   fiat_p256_uint1 x121;
-  fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115);
   uint64_t x122;
   fiat_p256_uint1 x123;
-  fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117);
   uint64_t x124;
   fiat_p256_uint1 x125;
-  fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109);
   uint64_t x126;
   fiat_p256_uint1 x127;
-  fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110);
-  uint64_t x128 = ((uint64_t)x127 + x108);
+  uint64_t x128;
   uint64_t x129;
   uint64_t x130;
-  fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3]));
   uint64_t x131;
   uint64_t x132;
-  fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2]));
   uint64_t x133;
   uint64_t x134;
-  fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1]));
   uint64_t x135;
   uint64_t x136;
-  fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0]));
   uint64_t x137;
   fiat_p256_uint1 x138;
-  fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133);
   uint64_t x139;
   fiat_p256_uint1 x140;
-  fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131);
   uint64_t x141;
   fiat_p256_uint1 x142;
-  fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129);
-  uint64_t x143 = (x142 + x130);
+  uint64_t x143;
   uint64_t x144;
   fiat_p256_uint1 x145;
-  fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135);
   uint64_t x146;
   fiat_p256_uint1 x147;
-  fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137);
   uint64_t x148;
   fiat_p256_uint1 x149;
-  fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139);
   uint64_t x150;
   fiat_p256_uint1 x151;
-  fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141);
   uint64_t x152;
   fiat_p256_uint1 x153;
-  fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143);
   uint64_t x154;
   uint64_t x155;
-  fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001));
   uint64_t x156;
   uint64_t x157;
-  fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff));
   uint64_t x158;
   uint64_t x159;
-  fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff));
   uint64_t x160;
   fiat_p256_uint1 x161;
-  fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156);
-  uint64_t x162 = (x161 + x157);
+  uint64_t x162;
   uint64_t x163;
   fiat_p256_uint1 x164;
-  fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158);
   uint64_t x165;
   fiat_p256_uint1 x166;
-  fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160);
   uint64_t x167;
   fiat_p256_uint1 x168;
-  fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162);
   uint64_t x169;
   fiat_p256_uint1 x170;
-  fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154);
   uint64_t x171;
   fiat_p256_uint1 x172;
-  fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155);
-  uint64_t x173 = ((uint64_t)x172 + x153);
+  uint64_t x173;
   uint64_t x174;
   fiat_p256_uint1 x175;
-  fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff));
   uint64_t x176;
   fiat_p256_uint1 x177;
-  fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff));
   uint64_t x178;
   fiat_p256_uint1 x179;
-  fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0);
   uint64_t x180;
   fiat_p256_uint1 x181;
-  fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001));
   uint64_t x182;
   fiat_p256_uint1 x183;
-  fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0);
   uint64_t x184;
-  fiat_p256_cmovznz_u64(&x184, x183, x174, x165);
   uint64_t x185;
-  fiat_p256_cmovznz_u64(&x185, x183, x176, x167);
   uint64_t x186;
-  fiat_p256_cmovznz_u64(&x186, x183, x178, x169);
   uint64_t x187;
+  x1 = (arg1[1]);
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[0]);
+  fiat_p256_mulx_u64(&x5, &x6, x4, (arg2[3]));
+  fiat_p256_mulx_u64(&x7, &x8, x4, (arg2[2]));
+  fiat_p256_mulx_u64(&x9, &x10, x4, (arg2[1]));
+  fiat_p256_mulx_u64(&x11, &x12, x4, (arg2[0]));
+  fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9);
+  fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7);
+  fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5);
+  x19 = (x18 + x6);
+  fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22);
+  x28 = (x27 + x23);
+  fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24);
+  fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26);
+  fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28);
+  fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20);
+  fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21);
+  fiat_p256_mulx_u64(&x39, &x40, x1, (arg2[3]));
+  fiat_p256_mulx_u64(&x41, &x42, x1, (arg2[2]));
+  fiat_p256_mulx_u64(&x43, &x44, x1, (arg2[1]));
+  fiat_p256_mulx_u64(&x45, &x46, x1, (arg2[0]));
+  fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43);
+  fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41);
+  fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39);
+  x53 = (x52 + x40);
+  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45);
+  fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47);
+  fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49);
+  fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51);
+  fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53);
+  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66);
+  x72 = (x71 + x67);
+  fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68);
+  fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70);
+  fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72);
+  fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64);
+  fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65);
+  x83 = ((uint64_t)x82 + x63);
+  fiat_p256_mulx_u64(&x84, &x85, x2, (arg2[3]));
+  fiat_p256_mulx_u64(&x86, &x87, x2, (arg2[2]));
+  fiat_p256_mulx_u64(&x88, &x89, x2, (arg2[1]));
+  fiat_p256_mulx_u64(&x90, &x91, x2, (arg2[0]));
+  fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88);
+  fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86);
+  fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84);
+  x98 = (x97 + x85);
+  fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90);
+  fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92);
+  fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94);
+  fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96);
+  fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98);
+  fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111);
+  x117 = (x116 + x112);
+  fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113);
+  fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115);
+  fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117);
+  fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109);
+  fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110);
+  x128 = ((uint64_t)x127 + x108);
+  fiat_p256_mulx_u64(&x129, &x130, x3, (arg2[3]));
+  fiat_p256_mulx_u64(&x131, &x132, x3, (arg2[2]));
+  fiat_p256_mulx_u64(&x133, &x134, x3, (arg2[1]));
+  fiat_p256_mulx_u64(&x135, &x136, x3, (arg2[0]));
+  fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133);
+  fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131);
+  fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129);
+  x143 = (x142 + x130);
+  fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135);
+  fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137);
+  fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139);
+  fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141);
+  fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143);
+  fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156);
+  x162 = (x161 + x157);
+  fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158);
+  fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160);
+  fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162);
+  fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154);
+  fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155);
+  x173 = ((uint64_t)x172 + x153);
+  fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0);
+  fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0);
+  fiat_p256_cmovznz_u64(&x184, x183, x174, x165);
+  fiat_p256_cmovznz_u64(&x185, x183, x176, x167);
+  fiat_p256_cmovznz_u64(&x186, x183, x178, x169);
   fiat_p256_cmovznz_u64(&x187, x183, x180, x171);
   out1[0] = x184;
   out1[1] = x185;
@@ -411,292 +463,304 @@
 
 /*
  * The function fiat_p256_square squares a field element in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_square(uint64_t out1[4], const uint64_t arg1[4]) {
-  uint64_t x1 = (arg1[1]);
-  uint64_t x2 = (arg1[2]);
-  uint64_t x3 = (arg1[3]);
-  uint64_t x4 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_square(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
   uint64_t x5;
   uint64_t x6;
-  fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3]));
   uint64_t x7;
   uint64_t x8;
-  fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2]));
   uint64_t x9;
   uint64_t x10;
-  fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1]));
   uint64_t x11;
   uint64_t x12;
-  fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0]));
   uint64_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9);
   uint64_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7);
   uint64_t x17;
   fiat_p256_uint1 x18;
-  fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5);
-  uint64_t x19 = (x18 + x6);
+  uint64_t x19;
   uint64_t x20;
   uint64_t x21;
-  fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001));
   uint64_t x22;
   uint64_t x23;
-  fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff));
   uint64_t x24;
   uint64_t x25;
-  fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff));
   uint64_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22);
-  uint64_t x28 = (x27 + x23);
+  uint64_t x28;
   uint64_t x29;
   fiat_p256_uint1 x30;
-  fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24);
   uint64_t x31;
   fiat_p256_uint1 x32;
-  fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26);
   uint64_t x33;
   fiat_p256_uint1 x34;
-  fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28);
   uint64_t x35;
   fiat_p256_uint1 x36;
-  fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20);
   uint64_t x37;
   fiat_p256_uint1 x38;
-  fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21);
   uint64_t x39;
   uint64_t x40;
-  fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3]));
   uint64_t x41;
   uint64_t x42;
-  fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2]));
   uint64_t x43;
   uint64_t x44;
-  fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1]));
   uint64_t x45;
   uint64_t x46;
-  fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0]));
   uint64_t x47;
   fiat_p256_uint1 x48;
-  fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43);
   uint64_t x49;
   fiat_p256_uint1 x50;
-  fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41);
   uint64_t x51;
   fiat_p256_uint1 x52;
-  fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39);
-  uint64_t x53 = (x52 + x40);
+  uint64_t x53;
   uint64_t x54;
   fiat_p256_uint1 x55;
-  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45);
   uint64_t x56;
   fiat_p256_uint1 x57;
-  fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47);
   uint64_t x58;
   fiat_p256_uint1 x59;
-  fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49);
   uint64_t x60;
   fiat_p256_uint1 x61;
-  fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51);
   uint64_t x62;
   fiat_p256_uint1 x63;
-  fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53);
   uint64_t x64;
   uint64_t x65;
-  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001));
   uint64_t x66;
   uint64_t x67;
-  fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff));
   uint64_t x68;
   uint64_t x69;
-  fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff));
   uint64_t x70;
   fiat_p256_uint1 x71;
-  fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66);
-  uint64_t x72 = (x71 + x67);
+  uint64_t x72;
   uint64_t x73;
   fiat_p256_uint1 x74;
-  fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68);
   uint64_t x75;
   fiat_p256_uint1 x76;
-  fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70);
   uint64_t x77;
   fiat_p256_uint1 x78;
-  fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72);
   uint64_t x79;
   fiat_p256_uint1 x80;
-  fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64);
   uint64_t x81;
   fiat_p256_uint1 x82;
-  fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65);
-  uint64_t x83 = ((uint64_t)x82 + x63);
+  uint64_t x83;
   uint64_t x84;
   uint64_t x85;
-  fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3]));
   uint64_t x86;
   uint64_t x87;
-  fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2]));
   uint64_t x88;
   uint64_t x89;
-  fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1]));
   uint64_t x90;
   uint64_t x91;
-  fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0]));
   uint64_t x92;
   fiat_p256_uint1 x93;
-  fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88);
   uint64_t x94;
   fiat_p256_uint1 x95;
-  fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86);
   uint64_t x96;
   fiat_p256_uint1 x97;
-  fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84);
-  uint64_t x98 = (x97 + x85);
+  uint64_t x98;
   uint64_t x99;
   fiat_p256_uint1 x100;
-  fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90);
   uint64_t x101;
   fiat_p256_uint1 x102;
-  fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92);
   uint64_t x103;
   fiat_p256_uint1 x104;
-  fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94);
   uint64_t x105;
   fiat_p256_uint1 x106;
-  fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96);
   uint64_t x107;
   fiat_p256_uint1 x108;
-  fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98);
   uint64_t x109;
   uint64_t x110;
-  fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001));
   uint64_t x111;
   uint64_t x112;
-  fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff));
   uint64_t x113;
   uint64_t x114;
-  fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff));
   uint64_t x115;
   fiat_p256_uint1 x116;
-  fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111);
-  uint64_t x117 = (x116 + x112);
+  uint64_t x117;
   uint64_t x118;
   fiat_p256_uint1 x119;
-  fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113);
   uint64_t x120;
   fiat_p256_uint1 x121;
-  fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115);
   uint64_t x122;
   fiat_p256_uint1 x123;
-  fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117);
   uint64_t x124;
   fiat_p256_uint1 x125;
-  fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109);
   uint64_t x126;
   fiat_p256_uint1 x127;
-  fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110);
-  uint64_t x128 = ((uint64_t)x127 + x108);
+  uint64_t x128;
   uint64_t x129;
   uint64_t x130;
-  fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3]));
   uint64_t x131;
   uint64_t x132;
-  fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2]));
   uint64_t x133;
   uint64_t x134;
-  fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1]));
   uint64_t x135;
   uint64_t x136;
-  fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0]));
   uint64_t x137;
   fiat_p256_uint1 x138;
-  fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133);
   uint64_t x139;
   fiat_p256_uint1 x140;
-  fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131);
   uint64_t x141;
   fiat_p256_uint1 x142;
-  fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129);
-  uint64_t x143 = (x142 + x130);
+  uint64_t x143;
   uint64_t x144;
   fiat_p256_uint1 x145;
-  fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135);
   uint64_t x146;
   fiat_p256_uint1 x147;
-  fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137);
   uint64_t x148;
   fiat_p256_uint1 x149;
-  fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139);
   uint64_t x150;
   fiat_p256_uint1 x151;
-  fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141);
   uint64_t x152;
   fiat_p256_uint1 x153;
-  fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143);
   uint64_t x154;
   uint64_t x155;
-  fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001));
   uint64_t x156;
   uint64_t x157;
-  fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff));
   uint64_t x158;
   uint64_t x159;
-  fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff));
   uint64_t x160;
   fiat_p256_uint1 x161;
-  fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156);
-  uint64_t x162 = (x161 + x157);
+  uint64_t x162;
   uint64_t x163;
   fiat_p256_uint1 x164;
-  fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158);
   uint64_t x165;
   fiat_p256_uint1 x166;
-  fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160);
   uint64_t x167;
   fiat_p256_uint1 x168;
-  fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162);
   uint64_t x169;
   fiat_p256_uint1 x170;
-  fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154);
   uint64_t x171;
   fiat_p256_uint1 x172;
-  fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155);
-  uint64_t x173 = ((uint64_t)x172 + x153);
+  uint64_t x173;
   uint64_t x174;
   fiat_p256_uint1 x175;
-  fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff));
   uint64_t x176;
   fiat_p256_uint1 x177;
-  fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff));
   uint64_t x178;
   fiat_p256_uint1 x179;
-  fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0);
   uint64_t x180;
   fiat_p256_uint1 x181;
-  fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001));
   uint64_t x182;
   fiat_p256_uint1 x183;
-  fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0);
   uint64_t x184;
-  fiat_p256_cmovznz_u64(&x184, x183, x174, x165);
   uint64_t x185;
-  fiat_p256_cmovznz_u64(&x185, x183, x176, x167);
   uint64_t x186;
-  fiat_p256_cmovznz_u64(&x186, x183, x178, x169);
   uint64_t x187;
+  x1 = (arg1[1]);
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[0]);
+  fiat_p256_mulx_u64(&x5, &x6, x4, (arg1[3]));
+  fiat_p256_mulx_u64(&x7, &x8, x4, (arg1[2]));
+  fiat_p256_mulx_u64(&x9, &x10, x4, (arg1[1]));
+  fiat_p256_mulx_u64(&x11, &x12, x4, (arg1[0]));
+  fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9);
+  fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7);
+  fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5);
+  x19 = (x18 + x6);
+  fiat_p256_mulx_u64(&x20, &x21, x11, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x22, &x23, x11, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x24, &x25, x11, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x26, &x27, 0x0, x25, x22);
+  x28 = (x27 + x23);
+  fiat_p256_addcarryx_u64(&x29, &x30, 0x0, x11, x24);
+  fiat_p256_addcarryx_u64(&x31, &x32, x30, x13, x26);
+  fiat_p256_addcarryx_u64(&x33, &x34, x32, x15, x28);
+  fiat_p256_addcarryx_u64(&x35, &x36, x34, x17, x20);
+  fiat_p256_addcarryx_u64(&x37, &x38, x36, x19, x21);
+  fiat_p256_mulx_u64(&x39, &x40, x1, (arg1[3]));
+  fiat_p256_mulx_u64(&x41, &x42, x1, (arg1[2]));
+  fiat_p256_mulx_u64(&x43, &x44, x1, (arg1[1]));
+  fiat_p256_mulx_u64(&x45, &x46, x1, (arg1[0]));
+  fiat_p256_addcarryx_u64(&x47, &x48, 0x0, x46, x43);
+  fiat_p256_addcarryx_u64(&x49, &x50, x48, x44, x41);
+  fiat_p256_addcarryx_u64(&x51, &x52, x50, x42, x39);
+  x53 = (x52 + x40);
+  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x31, x45);
+  fiat_p256_addcarryx_u64(&x56, &x57, x55, x33, x47);
+  fiat_p256_addcarryx_u64(&x58, &x59, x57, x35, x49);
+  fiat_p256_addcarryx_u64(&x60, &x61, x59, x37, x51);
+  fiat_p256_addcarryx_u64(&x62, &x63, x61, x38, x53);
+  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x66, &x67, x54, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x68, &x69, x54, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x70, &x71, 0x0, x69, x66);
+  x72 = (x71 + x67);
+  fiat_p256_addcarryx_u64(&x73, &x74, 0x0, x54, x68);
+  fiat_p256_addcarryx_u64(&x75, &x76, x74, x56, x70);
+  fiat_p256_addcarryx_u64(&x77, &x78, x76, x58, x72);
+  fiat_p256_addcarryx_u64(&x79, &x80, x78, x60, x64);
+  fiat_p256_addcarryx_u64(&x81, &x82, x80, x62, x65);
+  x83 = ((uint64_t)x82 + x63);
+  fiat_p256_mulx_u64(&x84, &x85, x2, (arg1[3]));
+  fiat_p256_mulx_u64(&x86, &x87, x2, (arg1[2]));
+  fiat_p256_mulx_u64(&x88, &x89, x2, (arg1[1]));
+  fiat_p256_mulx_u64(&x90, &x91, x2, (arg1[0]));
+  fiat_p256_addcarryx_u64(&x92, &x93, 0x0, x91, x88);
+  fiat_p256_addcarryx_u64(&x94, &x95, x93, x89, x86);
+  fiat_p256_addcarryx_u64(&x96, &x97, x95, x87, x84);
+  x98 = (x97 + x85);
+  fiat_p256_addcarryx_u64(&x99, &x100, 0x0, x75, x90);
+  fiat_p256_addcarryx_u64(&x101, &x102, x100, x77, x92);
+  fiat_p256_addcarryx_u64(&x103, &x104, x102, x79, x94);
+  fiat_p256_addcarryx_u64(&x105, &x106, x104, x81, x96);
+  fiat_p256_addcarryx_u64(&x107, &x108, x106, x83, x98);
+  fiat_p256_mulx_u64(&x109, &x110, x99, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x111, &x112, x99, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x113, &x114, x99, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x115, &x116, 0x0, x114, x111);
+  x117 = (x116 + x112);
+  fiat_p256_addcarryx_u64(&x118, &x119, 0x0, x99, x113);
+  fiat_p256_addcarryx_u64(&x120, &x121, x119, x101, x115);
+  fiat_p256_addcarryx_u64(&x122, &x123, x121, x103, x117);
+  fiat_p256_addcarryx_u64(&x124, &x125, x123, x105, x109);
+  fiat_p256_addcarryx_u64(&x126, &x127, x125, x107, x110);
+  x128 = ((uint64_t)x127 + x108);
+  fiat_p256_mulx_u64(&x129, &x130, x3, (arg1[3]));
+  fiat_p256_mulx_u64(&x131, &x132, x3, (arg1[2]));
+  fiat_p256_mulx_u64(&x133, &x134, x3, (arg1[1]));
+  fiat_p256_mulx_u64(&x135, &x136, x3, (arg1[0]));
+  fiat_p256_addcarryx_u64(&x137, &x138, 0x0, x136, x133);
+  fiat_p256_addcarryx_u64(&x139, &x140, x138, x134, x131);
+  fiat_p256_addcarryx_u64(&x141, &x142, x140, x132, x129);
+  x143 = (x142 + x130);
+  fiat_p256_addcarryx_u64(&x144, &x145, 0x0, x120, x135);
+  fiat_p256_addcarryx_u64(&x146, &x147, x145, x122, x137);
+  fiat_p256_addcarryx_u64(&x148, &x149, x147, x124, x139);
+  fiat_p256_addcarryx_u64(&x150, &x151, x149, x126, x141);
+  fiat_p256_addcarryx_u64(&x152, &x153, x151, x128, x143);
+  fiat_p256_mulx_u64(&x154, &x155, x144, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x156, &x157, x144, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x158, &x159, x144, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x160, &x161, 0x0, x159, x156);
+  x162 = (x161 + x157);
+  fiat_p256_addcarryx_u64(&x163, &x164, 0x0, x144, x158);
+  fiat_p256_addcarryx_u64(&x165, &x166, x164, x146, x160);
+  fiat_p256_addcarryx_u64(&x167, &x168, x166, x148, x162);
+  fiat_p256_addcarryx_u64(&x169, &x170, x168, x150, x154);
+  fiat_p256_addcarryx_u64(&x171, &x172, x170, x152, x155);
+  x173 = ((uint64_t)x172 + x153);
+  fiat_p256_subborrowx_u64(&x174, &x175, 0x0, x165, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x176, &x177, x175, x167, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x178, &x179, x177, x169, 0x0);
+  fiat_p256_subborrowx_u64(&x180, &x181, x179, x171, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x182, &x183, x181, x173, 0x0);
+  fiat_p256_cmovznz_u64(&x184, x183, x174, x165);
+  fiat_p256_cmovznz_u64(&x185, x183, x176, x167);
+  fiat_p256_cmovznz_u64(&x186, x183, x178, x169);
   fiat_p256_cmovznz_u64(&x187, x183, x180, x171);
   out1[0] = x184;
   out1[1] = x185;
@@ -706,6 +770,7 @@
 
 /*
  * The function fiat_p256_add adds two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -713,47 +778,42 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) + eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_add(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_add(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
   uint64_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
   uint64_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
   uint64_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
   uint64_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
   uint64_t x9;
   fiat_p256_uint1 x10;
-  fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff));
   uint64_t x11;
   fiat_p256_uint1 x12;
-  fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff));
   uint64_t x13;
   fiat_p256_uint1 x14;
-  fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0);
   uint64_t x15;
   fiat_p256_uint1 x16;
-  fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001));
   uint64_t x17;
   fiat_p256_uint1 x18;
-  fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0);
   uint64_t x19;
-  fiat_p256_cmovznz_u64(&x19, x18, x9, x1);
   uint64_t x20;
-  fiat_p256_cmovznz_u64(&x20, x18, x11, x3);
   uint64_t x21;
-  fiat_p256_cmovznz_u64(&x21, x18, x13, x5);
   uint64_t x22;
+  fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
+  fiat_p256_addcarryx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
+  fiat_p256_addcarryx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
+  fiat_p256_addcarryx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
+  fiat_p256_subborrowx_u64(&x9, &x10, 0x0, x1, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x11, &x12, x10, x3, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x13, &x14, x12, x5, 0x0);
+  fiat_p256_subborrowx_u64(&x15, &x16, x14, x7, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x17, &x18, x16, x8, 0x0);
+  fiat_p256_cmovznz_u64(&x19, x18, x9, x1);
+  fiat_p256_cmovznz_u64(&x20, x18, x11, x3);
+  fiat_p256_cmovznz_u64(&x21, x18, x13, x5);
   fiat_p256_cmovznz_u64(&x22, x18, x15, x7);
   out1[0] = x19;
   out1[1] = x20;
@@ -763,6 +823,7 @@
 
 /*
  * The function fiat_p256_sub subtracts two field elements in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  *   0 ≤ eval arg2 < m
@@ -770,38 +831,33 @@
  *   eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) - eval (from_montgomery arg2)) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_sub(uint64_t out1[4], const uint64_t arg1[4], const uint64_t arg2[4]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_sub(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1, const fiat_p256_montgomery_domain_field_element arg2) {
   uint64_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
   uint64_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
   uint64_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
   uint64_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
   uint64_t x9;
-  fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff));
   uint64_t x10;
   fiat_p256_uint1 x11;
-  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, (x9 & UINT64_C(0xffffffffffffffff)));
   uint64_t x12;
   fiat_p256_uint1 x13;
-  fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff)));
   uint64_t x14;
   fiat_p256_uint1 x15;
-  fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0);
   uint64_t x16;
   fiat_p256_uint1 x17;
+  fiat_p256_subborrowx_u64(&x1, &x2, 0x0, (arg1[0]), (arg2[0]));
+  fiat_p256_subborrowx_u64(&x3, &x4, x2, (arg1[1]), (arg2[1]));
+  fiat_p256_subborrowx_u64(&x5, &x6, x4, (arg1[2]), (arg2[2]));
+  fiat_p256_subborrowx_u64(&x7, &x8, x6, (arg1[3]), (arg2[3]));
+  fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9);
+  fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff)));
+  fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0);
   fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001)));
   out1[0] = x10;
   out1[1] = x12;
@@ -811,43 +867,40 @@
 
 /*
  * The function fiat_p256_opp negates a field element in the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval (from_montgomery out1) mod m = -eval (from_montgomery arg1) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_opp(uint64_t out1[4], const uint64_t arg1[4]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_opp(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
   uint64_t x1;
   fiat_p256_uint1 x2;
-  fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0]));
   uint64_t x3;
   fiat_p256_uint1 x4;
-  fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1]));
   uint64_t x5;
   fiat_p256_uint1 x6;
-  fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2]));
   uint64_t x7;
   fiat_p256_uint1 x8;
-  fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3]));
   uint64_t x9;
-  fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff));
   uint64_t x10;
   fiat_p256_uint1 x11;
-  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, (x9 & UINT64_C(0xffffffffffffffff)));
   uint64_t x12;
   fiat_p256_uint1 x13;
-  fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff)));
   uint64_t x14;
   fiat_p256_uint1 x15;
-  fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0);
   uint64_t x16;
   fiat_p256_uint1 x17;
+  fiat_p256_subborrowx_u64(&x1, &x2, 0x0, 0x0, (arg1[0]));
+  fiat_p256_subborrowx_u64(&x3, &x4, x2, 0x0, (arg1[1]));
+  fiat_p256_subborrowx_u64(&x5, &x6, x4, 0x0, (arg1[2]));
+  fiat_p256_subborrowx_u64(&x7, &x8, x6, 0x0, (arg1[3]));
+  fiat_p256_cmovznz_u64(&x9, x8, 0x0, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x9);
+  fiat_p256_addcarryx_u64(&x12, &x13, x11, x3, (x9 & UINT32_C(0xffffffff)));
+  fiat_p256_addcarryx_u64(&x14, &x15, x13, x5, 0x0);
   fiat_p256_addcarryx_u64(&x16, &x17, x15, x7, (x9 & UINT64_C(0xffffffff00000001)));
   out1[0] = x10;
   out1[1] = x12;
@@ -857,153 +910,152 @@
 
 /*
  * The function fiat_p256_from_montgomery translates a field element out of the Montgomery domain.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
  *   eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m
  *   0 ≤ eval out1 < m
  *
- * Input Bounds:
- *   arg1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
- * Output Bounds:
- *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_from_montgomery(uint64_t out1[4], const uint64_t arg1[4]) {
-  uint64_t x1 = (arg1[0]);
+static FIAT_P256_FIAT_INLINE void fiat_p256_from_montgomery(fiat_p256_non_montgomery_domain_field_element out1, const fiat_p256_montgomery_domain_field_element arg1) {
+  uint64_t x1;
   uint64_t x2;
   uint64_t x3;
-  fiat_p256_mulx_u64(&x2, &x3, x1, UINT64_C(0xffffffff00000001));
   uint64_t x4;
   uint64_t x5;
-  fiat_p256_mulx_u64(&x4, &x5, x1, UINT32_C(0xffffffff));
   uint64_t x6;
   uint64_t x7;
-  fiat_p256_mulx_u64(&x6, &x7, x1, UINT64_C(0xffffffffffffffff));
   uint64_t x8;
   fiat_p256_uint1 x9;
-  fiat_p256_addcarryx_u64(&x8, &x9, 0x0, x7, x4);
   uint64_t x10;
   fiat_p256_uint1 x11;
-  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x6);
   uint64_t x12;
   fiat_p256_uint1 x13;
-  fiat_p256_addcarryx_u64(&x12, &x13, x11, 0x0, x8);
   uint64_t x14;
   fiat_p256_uint1 x15;
-  fiat_p256_addcarryx_u64(&x14, &x15, 0x0, x12, (arg1[1]));
   uint64_t x16;
   uint64_t x17;
-  fiat_p256_mulx_u64(&x16, &x17, x14, UINT64_C(0xffffffff00000001));
   uint64_t x18;
   uint64_t x19;
-  fiat_p256_mulx_u64(&x18, &x19, x14, UINT32_C(0xffffffff));
   uint64_t x20;
   uint64_t x21;
-  fiat_p256_mulx_u64(&x20, &x21, x14, UINT64_C(0xffffffffffffffff));
   uint64_t x22;
   fiat_p256_uint1 x23;
-  fiat_p256_addcarryx_u64(&x22, &x23, 0x0, x21, x18);
   uint64_t x24;
   fiat_p256_uint1 x25;
-  fiat_p256_addcarryx_u64(&x24, &x25, 0x0, x14, x20);
   uint64_t x26;
   fiat_p256_uint1 x27;
-  fiat_p256_addcarryx_u64(&x26, &x27, x25, (x15 + (x13 + (x9 + x5))), x22);
   uint64_t x28;
   fiat_p256_uint1 x29;
-  fiat_p256_addcarryx_u64(&x28, &x29, x27, x2, (x23 + x19));
   uint64_t x30;
   fiat_p256_uint1 x31;
-  fiat_p256_addcarryx_u64(&x30, &x31, x29, x3, x16);
   uint64_t x32;
   fiat_p256_uint1 x33;
-  fiat_p256_addcarryx_u64(&x32, &x33, 0x0, x26, (arg1[2]));
   uint64_t x34;
   fiat_p256_uint1 x35;
-  fiat_p256_addcarryx_u64(&x34, &x35, x33, x28, 0x0);
   uint64_t x36;
   fiat_p256_uint1 x37;
-  fiat_p256_addcarryx_u64(&x36, &x37, x35, x30, 0x0);
   uint64_t x38;
   uint64_t x39;
-  fiat_p256_mulx_u64(&x38, &x39, x32, UINT64_C(0xffffffff00000001));
   uint64_t x40;
   uint64_t x41;
-  fiat_p256_mulx_u64(&x40, &x41, x32, UINT32_C(0xffffffff));
   uint64_t x42;
   uint64_t x43;
-  fiat_p256_mulx_u64(&x42, &x43, x32, UINT64_C(0xffffffffffffffff));
   uint64_t x44;
   fiat_p256_uint1 x45;
-  fiat_p256_addcarryx_u64(&x44, &x45, 0x0, x43, x40);
   uint64_t x46;
   fiat_p256_uint1 x47;
-  fiat_p256_addcarryx_u64(&x46, &x47, 0x0, x32, x42);
   uint64_t x48;
   fiat_p256_uint1 x49;
-  fiat_p256_addcarryx_u64(&x48, &x49, x47, x34, x44);
   uint64_t x50;
   fiat_p256_uint1 x51;
-  fiat_p256_addcarryx_u64(&x50, &x51, x49, x36, (x45 + x41));
   uint64_t x52;
   fiat_p256_uint1 x53;
-  fiat_p256_addcarryx_u64(&x52, &x53, x51, (x37 + (x31 + x17)), x38);
   uint64_t x54;
   fiat_p256_uint1 x55;
-  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x48, (arg1[3]));
   uint64_t x56;
   fiat_p256_uint1 x57;
-  fiat_p256_addcarryx_u64(&x56, &x57, x55, x50, 0x0);
   uint64_t x58;
   fiat_p256_uint1 x59;
-  fiat_p256_addcarryx_u64(&x58, &x59, x57, x52, 0x0);
   uint64_t x60;
   uint64_t x61;
-  fiat_p256_mulx_u64(&x60, &x61, x54, UINT64_C(0xffffffff00000001));
   uint64_t x62;
   uint64_t x63;
-  fiat_p256_mulx_u64(&x62, &x63, x54, UINT32_C(0xffffffff));
   uint64_t x64;
   uint64_t x65;
-  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffffffffffff));
   uint64_t x66;
   fiat_p256_uint1 x67;
-  fiat_p256_addcarryx_u64(&x66, &x67, 0x0, x65, x62);
   uint64_t x68;
   fiat_p256_uint1 x69;
-  fiat_p256_addcarryx_u64(&x68, &x69, 0x0, x54, x64);
   uint64_t x70;
   fiat_p256_uint1 x71;
-  fiat_p256_addcarryx_u64(&x70, &x71, x69, x56, x66);
   uint64_t x72;
   fiat_p256_uint1 x73;
-  fiat_p256_addcarryx_u64(&x72, &x73, x71, x58, (x67 + x63));
   uint64_t x74;
   fiat_p256_uint1 x75;
-  fiat_p256_addcarryx_u64(&x74, &x75, x73, (x59 + (x53 + x39)), x60);
-  uint64_t x76 = (x75 + x61);
+  uint64_t x76;
   uint64_t x77;
   fiat_p256_uint1 x78;
-  fiat_p256_subborrowx_u64(&x77, &x78, 0x0, x70, UINT64_C(0xffffffffffffffff));
   uint64_t x79;
   fiat_p256_uint1 x80;
-  fiat_p256_subborrowx_u64(&x79, &x80, x78, x72, UINT32_C(0xffffffff));
   uint64_t x81;
   fiat_p256_uint1 x82;
-  fiat_p256_subborrowx_u64(&x81, &x82, x80, x74, 0x0);
   uint64_t x83;
   fiat_p256_uint1 x84;
-  fiat_p256_subborrowx_u64(&x83, &x84, x82, x76, UINT64_C(0xffffffff00000001));
   uint64_t x85;
   fiat_p256_uint1 x86;
-  fiat_p256_subborrowx_u64(&x85, &x86, x84, 0x0, 0x0);
   uint64_t x87;
-  fiat_p256_cmovznz_u64(&x87, x86, x77, x70);
   uint64_t x88;
-  fiat_p256_cmovznz_u64(&x88, x86, x79, x72);
   uint64_t x89;
-  fiat_p256_cmovznz_u64(&x89, x86, x81, x74);
   uint64_t x90;
+  x1 = (arg1[0]);
+  fiat_p256_mulx_u64(&x2, &x3, x1, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x4, &x5, x1, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x6, &x7, x1, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x8, &x9, 0x0, x7, x4);
+  fiat_p256_addcarryx_u64(&x10, &x11, 0x0, x1, x6);
+  fiat_p256_addcarryx_u64(&x12, &x13, x11, 0x0, x8);
+  fiat_p256_addcarryx_u64(&x14, &x15, 0x0, x12, (arg1[1]));
+  fiat_p256_mulx_u64(&x16, &x17, x14, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x18, &x19, x14, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x20, &x21, x14, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x22, &x23, 0x0, x21, x18);
+  fiat_p256_addcarryx_u64(&x24, &x25, 0x0, x14, x20);
+  fiat_p256_addcarryx_u64(&x26, &x27, x25, (x15 + (x13 + (x9 + x5))), x22);
+  fiat_p256_addcarryx_u64(&x28, &x29, x27, x2, (x23 + x19));
+  fiat_p256_addcarryx_u64(&x30, &x31, x29, x3, x16);
+  fiat_p256_addcarryx_u64(&x32, &x33, 0x0, x26, (arg1[2]));
+  fiat_p256_addcarryx_u64(&x34, &x35, x33, x28, 0x0);
+  fiat_p256_addcarryx_u64(&x36, &x37, x35, x30, 0x0);
+  fiat_p256_mulx_u64(&x38, &x39, x32, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x40, &x41, x32, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x42, &x43, x32, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x44, &x45, 0x0, x43, x40);
+  fiat_p256_addcarryx_u64(&x46, &x47, 0x0, x32, x42);
+  fiat_p256_addcarryx_u64(&x48, &x49, x47, x34, x44);
+  fiat_p256_addcarryx_u64(&x50, &x51, x49, x36, (x45 + x41));
+  fiat_p256_addcarryx_u64(&x52, &x53, x51, (x37 + (x31 + x17)), x38);
+  fiat_p256_addcarryx_u64(&x54, &x55, 0x0, x48, (arg1[3]));
+  fiat_p256_addcarryx_u64(&x56, &x57, x55, x50, 0x0);
+  fiat_p256_addcarryx_u64(&x58, &x59, x57, x52, 0x0);
+  fiat_p256_mulx_u64(&x60, &x61, x54, UINT64_C(0xffffffff00000001));
+  fiat_p256_mulx_u64(&x62, &x63, x54, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x64, &x65, x54, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x66, &x67, 0x0, x65, x62);
+  fiat_p256_addcarryx_u64(&x68, &x69, 0x0, x54, x64);
+  fiat_p256_addcarryx_u64(&x70, &x71, x69, x56, x66);
+  fiat_p256_addcarryx_u64(&x72, &x73, x71, x58, (x67 + x63));
+  fiat_p256_addcarryx_u64(&x74, &x75, x73, (x59 + (x53 + x39)), x60);
+  x76 = (x75 + x61);
+  fiat_p256_subborrowx_u64(&x77, &x78, 0x0, x70, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x79, &x80, x78, x72, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x81, &x82, x80, x74, 0x0);
+  fiat_p256_subborrowx_u64(&x83, &x84, x82, x76, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x85, &x86, x84, 0x0, 0x0);
+  fiat_p256_cmovznz_u64(&x87, x86, x77, x70);
+  fiat_p256_cmovznz_u64(&x88, x86, x79, x72);
+  fiat_p256_cmovznz_u64(&x89, x86, x81, x74);
   fiat_p256_cmovznz_u64(&x90, x86, x83, x76);
   out1[0] = x87;
   out1[1] = x88;
@@ -1012,7 +1064,284 @@
 }
 
 /*
+ * The function fiat_p256_to_montgomery translates a field element into the Montgomery domain.
+ *
+ * Preconditions:
+ *   0 ≤ eval arg1 < m
+ * Postconditions:
+ *   eval (from_montgomery out1) mod m = eval arg1 mod m
+ *   0 ≤ eval out1 < m
+ *
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_to_montgomery(fiat_p256_montgomery_domain_field_element out1, const fiat_p256_non_montgomery_domain_field_element arg1) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  fiat_p256_uint1 x14;
+  uint64_t x15;
+  fiat_p256_uint1 x16;
+  uint64_t x17;
+  fiat_p256_uint1 x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  fiat_p256_uint1 x26;
+  uint64_t x27;
+  fiat_p256_uint1 x28;
+  uint64_t x29;
+  fiat_p256_uint1 x30;
+  uint64_t x31;
+  fiat_p256_uint1 x32;
+  uint64_t x33;
+  fiat_p256_uint1 x34;
+  uint64_t x35;
+  fiat_p256_uint1 x36;
+  uint64_t x37;
+  uint64_t x38;
+  uint64_t x39;
+  uint64_t x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  fiat_p256_uint1 x46;
+  uint64_t x47;
+  fiat_p256_uint1 x48;
+  uint64_t x49;
+  fiat_p256_uint1 x50;
+  uint64_t x51;
+  fiat_p256_uint1 x52;
+  uint64_t x53;
+  fiat_p256_uint1 x54;
+  uint64_t x55;
+  fiat_p256_uint1 x56;
+  uint64_t x57;
+  fiat_p256_uint1 x58;
+  uint64_t x59;
+  uint64_t x60;
+  uint64_t x61;
+  uint64_t x62;
+  uint64_t x63;
+  uint64_t x64;
+  uint64_t x65;
+  fiat_p256_uint1 x66;
+  uint64_t x67;
+  fiat_p256_uint1 x68;
+  uint64_t x69;
+  fiat_p256_uint1 x70;
+  uint64_t x71;
+  fiat_p256_uint1 x72;
+  uint64_t x73;
+  fiat_p256_uint1 x74;
+  uint64_t x75;
+  fiat_p256_uint1 x76;
+  uint64_t x77;
+  uint64_t x78;
+  uint64_t x79;
+  uint64_t x80;
+  uint64_t x81;
+  uint64_t x82;
+  uint64_t x83;
+  uint64_t x84;
+  uint64_t x85;
+  fiat_p256_uint1 x86;
+  uint64_t x87;
+  fiat_p256_uint1 x88;
+  uint64_t x89;
+  fiat_p256_uint1 x90;
+  uint64_t x91;
+  fiat_p256_uint1 x92;
+  uint64_t x93;
+  fiat_p256_uint1 x94;
+  uint64_t x95;
+  fiat_p256_uint1 x96;
+  uint64_t x97;
+  fiat_p256_uint1 x98;
+  uint64_t x99;
+  uint64_t x100;
+  uint64_t x101;
+  uint64_t x102;
+  uint64_t x103;
+  uint64_t x104;
+  uint64_t x105;
+  fiat_p256_uint1 x106;
+  uint64_t x107;
+  fiat_p256_uint1 x108;
+  uint64_t x109;
+  fiat_p256_uint1 x110;
+  uint64_t x111;
+  fiat_p256_uint1 x112;
+  uint64_t x113;
+  fiat_p256_uint1 x114;
+  uint64_t x115;
+  fiat_p256_uint1 x116;
+  uint64_t x117;
+  uint64_t x118;
+  uint64_t x119;
+  uint64_t x120;
+  uint64_t x121;
+  uint64_t x122;
+  uint64_t x123;
+  uint64_t x124;
+  uint64_t x125;
+  fiat_p256_uint1 x126;
+  uint64_t x127;
+  fiat_p256_uint1 x128;
+  uint64_t x129;
+  fiat_p256_uint1 x130;
+  uint64_t x131;
+  fiat_p256_uint1 x132;
+  uint64_t x133;
+  fiat_p256_uint1 x134;
+  uint64_t x135;
+  fiat_p256_uint1 x136;
+  uint64_t x137;
+  fiat_p256_uint1 x138;
+  uint64_t x139;
+  uint64_t x140;
+  uint64_t x141;
+  uint64_t x142;
+  uint64_t x143;
+  uint64_t x144;
+  uint64_t x145;
+  fiat_p256_uint1 x146;
+  uint64_t x147;
+  fiat_p256_uint1 x148;
+  uint64_t x149;
+  fiat_p256_uint1 x150;
+  uint64_t x151;
+  fiat_p256_uint1 x152;
+  uint64_t x153;
+  fiat_p256_uint1 x154;
+  uint64_t x155;
+  fiat_p256_uint1 x156;
+  uint64_t x157;
+  fiat_p256_uint1 x158;
+  uint64_t x159;
+  fiat_p256_uint1 x160;
+  uint64_t x161;
+  fiat_p256_uint1 x162;
+  uint64_t x163;
+  fiat_p256_uint1 x164;
+  uint64_t x165;
+  fiat_p256_uint1 x166;
+  uint64_t x167;
+  uint64_t x168;
+  uint64_t x169;
+  uint64_t x170;
+  x1 = (arg1[1]);
+  x2 = (arg1[2]);
+  x3 = (arg1[3]);
+  x4 = (arg1[0]);  /* limb 0 is consumed first (round 1); x1, x2, x3 drive rounds 2, 3 and 4 */
+  fiat_p256_mulx_u64(&x5, &x6, x4, UINT64_C(0x4fffffffd));  /* round 1: x4 times four fixed words (presumably the limbs of R^2 mod m -- TODO confirm against fiat-crypto) */
+  fiat_p256_mulx_u64(&x7, &x8, x4, UINT64_C(0xfffffffffffffffe));
+  fiat_p256_mulx_u64(&x9, &x10, x4, UINT64_C(0xfffffffbffffffff));
+  fiat_p256_mulx_u64(&x11, &x12, x4, 0x3);
+  fiat_p256_addcarryx_u64(&x13, &x14, 0x0, x12, x9);  /* carry chain joining the second output of each product with the first output of the next */
+  fiat_p256_addcarryx_u64(&x15, &x16, x14, x10, x7);
+  fiat_p256_addcarryx_u64(&x17, &x18, x16, x8, x5);
+  fiat_p256_mulx_u64(&x19, &x20, x11, UINT64_C(0xffffffff00000001));  /* reduction step: x11 times the same three non-zero words that the final subtraction below uses */
+  fiat_p256_mulx_u64(&x21, &x22, x11, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x23, &x24, x11, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x25, &x26, 0x0, x24, x21);
+  fiat_p256_addcarryx_u64(&x27, &x28, 0x0, x11, x23);  /* x27 is never read again; only the carry x28 is propagated */
+  fiat_p256_addcarryx_u64(&x29, &x30, x28, x13, x25);
+  fiat_p256_addcarryx_u64(&x31, &x32, x30, x15, (x26 + x22));
+  fiat_p256_addcarryx_u64(&x33, &x34, x32, x17, x19);
+  fiat_p256_addcarryx_u64(&x35, &x36, x34, (x18 + x6), x20);
+  fiat_p256_mulx_u64(&x37, &x38, x1, UINT64_C(0x4fffffffd));  /* round 2: same multiply pattern with x1 (arg1[1]) */
+  fiat_p256_mulx_u64(&x39, &x40, x1, UINT64_C(0xfffffffffffffffe));
+  fiat_p256_mulx_u64(&x41, &x42, x1, UINT64_C(0xfffffffbffffffff));
+  fiat_p256_mulx_u64(&x43, &x44, x1, 0x3);
+  fiat_p256_addcarryx_u64(&x45, &x46, 0x0, x44, x41);
+  fiat_p256_addcarryx_u64(&x47, &x48, x46, x42, x39);
+  fiat_p256_addcarryx_u64(&x49, &x50, x48, x40, x37);
+  fiat_p256_addcarryx_u64(&x51, &x52, 0x0, x29, x43);  /* add this round's products to the words carried over from round 1 (x29, x31, x33, x35) */
+  fiat_p256_addcarryx_u64(&x53, &x54, x52, x31, x45);
+  fiat_p256_addcarryx_u64(&x55, &x56, x54, x33, x47);
+  fiat_p256_addcarryx_u64(&x57, &x58, x56, x35, x49);
+  fiat_p256_mulx_u64(&x59, &x60, x51, UINT64_C(0xffffffff00000001));  /* reduction step for round 2, driven by x51 */
+  fiat_p256_mulx_u64(&x61, &x62, x51, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x63, &x64, x51, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x65, &x66, 0x0, x64, x61);
+  fiat_p256_addcarryx_u64(&x67, &x68, 0x0, x51, x63);  /* x67 is never read again; only the carry x68 is propagated */
+  fiat_p256_addcarryx_u64(&x69, &x70, x68, x53, x65);
+  fiat_p256_addcarryx_u64(&x71, &x72, x70, x55, (x66 + x62));
+  fiat_p256_addcarryx_u64(&x73, &x74, x72, x57, x59);
+  fiat_p256_addcarryx_u64(&x75, &x76, x74, (((uint64_t)x58 + x36) + (x50 + x38)), x60);
+  fiat_p256_mulx_u64(&x77, &x78, x2, UINT64_C(0x4fffffffd));  /* round 3: same multiply pattern with x2 (arg1[2]) */
+  fiat_p256_mulx_u64(&x79, &x80, x2, UINT64_C(0xfffffffffffffffe));
+  fiat_p256_mulx_u64(&x81, &x82, x2, UINT64_C(0xfffffffbffffffff));
+  fiat_p256_mulx_u64(&x83, &x84, x2, 0x3);
+  fiat_p256_addcarryx_u64(&x85, &x86, 0x0, x84, x81);
+  fiat_p256_addcarryx_u64(&x87, &x88, x86, x82, x79);
+  fiat_p256_addcarryx_u64(&x89, &x90, x88, x80, x77);
+  fiat_p256_addcarryx_u64(&x91, &x92, 0x0, x69, x83);  /* add this round's products to the words carried over from round 2 (x69, x71, x73, x75) */
+  fiat_p256_addcarryx_u64(&x93, &x94, x92, x71, x85);
+  fiat_p256_addcarryx_u64(&x95, &x96, x94, x73, x87);
+  fiat_p256_addcarryx_u64(&x97, &x98, x96, x75, x89);
+  fiat_p256_mulx_u64(&x99, &x100, x91, UINT64_C(0xffffffff00000001));  /* reduction step for round 3, driven by x91 */
+  fiat_p256_mulx_u64(&x101, &x102, x91, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x103, &x104, x91, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x105, &x106, 0x0, x104, x101);
+  fiat_p256_addcarryx_u64(&x107, &x108, 0x0, x91, x103);  /* x107 is never read again; only the carry x108 is propagated */
+  fiat_p256_addcarryx_u64(&x109, &x110, x108, x93, x105);
+  fiat_p256_addcarryx_u64(&x111, &x112, x110, x95, (x106 + x102));
+  fiat_p256_addcarryx_u64(&x113, &x114, x112, x97, x99);
+  fiat_p256_addcarryx_u64(&x115, &x116, x114, (((uint64_t)x98 + x76) + (x90 + x78)), x100);
+  fiat_p256_mulx_u64(&x117, &x118, x3, UINT64_C(0x4fffffffd));  /* round 4: same multiply pattern with x3 (arg1[3]) */
+  fiat_p256_mulx_u64(&x119, &x120, x3, UINT64_C(0xfffffffffffffffe));
+  fiat_p256_mulx_u64(&x121, &x122, x3, UINT64_C(0xfffffffbffffffff));
+  fiat_p256_mulx_u64(&x123, &x124, x3, 0x3);
+  fiat_p256_addcarryx_u64(&x125, &x126, 0x0, x124, x121);
+  fiat_p256_addcarryx_u64(&x127, &x128, x126, x122, x119);
+  fiat_p256_addcarryx_u64(&x129, &x130, x128, x120, x117);
+  fiat_p256_addcarryx_u64(&x131, &x132, 0x0, x109, x123);  /* add this round's products to the words carried over from round 3 (x109, x111, x113, x115) */
+  fiat_p256_addcarryx_u64(&x133, &x134, x132, x111, x125);
+  fiat_p256_addcarryx_u64(&x135, &x136, x134, x113, x127);
+  fiat_p256_addcarryx_u64(&x137, &x138, x136, x115, x129);
+  fiat_p256_mulx_u64(&x139, &x140, x131, UINT64_C(0xffffffff00000001));  /* reduction step for round 4, driven by x131 */
+  fiat_p256_mulx_u64(&x141, &x142, x131, UINT32_C(0xffffffff));
+  fiat_p256_mulx_u64(&x143, &x144, x131, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x145, &x146, 0x0, x144, x141);
+  fiat_p256_addcarryx_u64(&x147, &x148, 0x0, x131, x143);  /* x147 is never read again; only the carry x148 is propagated */
+  fiat_p256_addcarryx_u64(&x149, &x150, x148, x133, x145);
+  fiat_p256_addcarryx_u64(&x151, &x152, x150, x135, (x146 + x142));
+  fiat_p256_addcarryx_u64(&x153, &x154, x152, x137, x139);
+  fiat_p256_addcarryx_u64(&x155, &x156, x154, (((uint64_t)x138 + x116) + (x130 + x118)), x140);
+  fiat_p256_subborrowx_u64(&x157, &x158, 0x0, x149, UINT64_C(0xffffffffffffffff));  /* trial subtraction of the fixed words (0xffffffffffffffff, 0xffffffff, 0x0, 0xffffffff00000001) -- presumably the modulus m */
+  fiat_p256_subborrowx_u64(&x159, &x160, x158, x151, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x161, &x162, x160, x153, 0x0);
+  fiat_p256_subborrowx_u64(&x163, &x164, x162, x155, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x165, &x166, x164, x156, 0x0);  /* x165 is never read again; only the final borrow x166 is used */
+  fiat_p256_cmovznz_u64(&x167, x166, x157, x149);  /* x166 picks between the subtracted words (x157..x163) and the unsubtracted ones (x149..x155) */
+  fiat_p256_cmovznz_u64(&x168, x166, x159, x151);
+  fiat_p256_cmovznz_u64(&x169, x166, x161, x153);
+  fiat_p256_cmovznz_u64(&x170, x166, x163, x155);
+  out1[0] = x167;
+  out1[1] = x168;
+  out1[2] = x169;
+  out1[3] = x170;
+}
+
+/*
  * The function fiat_p256_nonzero outputs a single non-zero word if the input is non-zero and zero otherwise.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
@@ -1023,13 +1352,15 @@
  * Output Bounds:
  *   out1: [0x0 ~> 0xffffffffffffffff]
  */
-static void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) {
-  uint64_t x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | ((arg1[3]) | (uint64_t)0x0))));
+static FIAT_P256_FIAT_INLINE void fiat_p256_nonzero(uint64_t* out1, const uint64_t arg1[4]) {
+  uint64_t x1;
+  x1 = ((arg1[0]) | ((arg1[1]) | ((arg1[2]) | (arg1[3]))));
   *out1 = x1;
 }
 
 /*
  * The function fiat_p256_selectznz is a multi-limb conditional select.
+ *
  * Postconditions:
  *   eval out1 = (if arg1 = 0 then eval arg2 else eval arg3)
  *
@@ -1040,14 +1371,14 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
+static FIAT_P256_FIAT_INLINE void fiat_p256_selectznz(uint64_t out1[4], fiat_p256_uint1 arg1, const uint64_t arg2[4], const uint64_t arg3[4]) {
   uint64_t x1;
-  fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
   uint64_t x2;
-  fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
   uint64_t x3;
-  fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
   uint64_t x4;
+  fiat_p256_cmovznz_u64(&x1, arg1, (arg2[0]), (arg3[0]));
+  fiat_p256_cmovznz_u64(&x2, arg1, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u64(&x3, arg1, (arg2[2]), (arg3[2]));
   fiat_p256_cmovznz_u64(&x4, arg1, (arg2[3]), (arg3[3]));
   out1[0] = x1;
   out1[1] = x2;
@@ -1056,7 +1387,8 @@
 }
 
 /*
- * The function fiat_p256_to_bytes serializes a field element in the Montgomery domain to bytes in little-endian order.
+ * The function fiat_p256_to_bytes serializes a field element NOT in the Montgomery domain to bytes in little-endian order.
+ *
  * Preconditions:
  *   0 ≤ eval arg1 < m
  * Postconditions:
@@ -1067,106 +1399,164 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff], [0x0 ~> 0xff]]
  */
-static void fiat_p256_to_bytes(uint8_t out1[32], const uint64_t arg1[4]) {
-  uint64_t x1 = (arg1[3]);
-  uint64_t x2 = (arg1[2]);
-  uint64_t x3 = (arg1[1]);
-  uint64_t x4 = (arg1[0]);
-  uint64_t x5 = (x4 >> 8);
-  uint8_t x6 = (uint8_t)(x4 & UINT8_C(0xff));
-  uint64_t x7 = (x5 >> 8);
-  uint8_t x8 = (uint8_t)(x5 & UINT8_C(0xff));
-  uint64_t x9 = (x7 >> 8);
-  uint8_t x10 = (uint8_t)(x7 & UINT8_C(0xff));
-  uint64_t x11 = (x9 >> 8);
-  uint8_t x12 = (uint8_t)(x9 & UINT8_C(0xff));
-  uint64_t x13 = (x11 >> 8);
-  uint8_t x14 = (uint8_t)(x11 & UINT8_C(0xff));
-  uint64_t x15 = (x13 >> 8);
-  uint8_t x16 = (uint8_t)(x13 & UINT8_C(0xff));
-  uint8_t x17 = (uint8_t)(x15 >> 8);
-  uint8_t x18 = (uint8_t)(x15 & UINT8_C(0xff));
-  uint8_t x19 = (uint8_t)(x17 & UINT8_C(0xff));
-  uint64_t x20 = (x3 >> 8);
-  uint8_t x21 = (uint8_t)(x3 & UINT8_C(0xff));
-  uint64_t x22 = (x20 >> 8);
-  uint8_t x23 = (uint8_t)(x20 & UINT8_C(0xff));
-  uint64_t x24 = (x22 >> 8);
-  uint8_t x25 = (uint8_t)(x22 & UINT8_C(0xff));
-  uint64_t x26 = (x24 >> 8);
-  uint8_t x27 = (uint8_t)(x24 & UINT8_C(0xff));
-  uint64_t x28 = (x26 >> 8);
-  uint8_t x29 = (uint8_t)(x26 & UINT8_C(0xff));
-  uint64_t x30 = (x28 >> 8);
-  uint8_t x31 = (uint8_t)(x28 & UINT8_C(0xff));
-  uint8_t x32 = (uint8_t)(x30 >> 8);
-  uint8_t x33 = (uint8_t)(x30 & UINT8_C(0xff));
-  uint8_t x34 = (uint8_t)(x32 & UINT8_C(0xff));
-  uint64_t x35 = (x2 >> 8);
-  uint8_t x36 = (uint8_t)(x2 & UINT8_C(0xff));
-  uint64_t x37 = (x35 >> 8);
-  uint8_t x38 = (uint8_t)(x35 & UINT8_C(0xff));
-  uint64_t x39 = (x37 >> 8);
-  uint8_t x40 = (uint8_t)(x37 & UINT8_C(0xff));
-  uint64_t x41 = (x39 >> 8);
-  uint8_t x42 = (uint8_t)(x39 & UINT8_C(0xff));
-  uint64_t x43 = (x41 >> 8);
-  uint8_t x44 = (uint8_t)(x41 & UINT8_C(0xff));
-  uint64_t x45 = (x43 >> 8);
-  uint8_t x46 = (uint8_t)(x43 & UINT8_C(0xff));
-  uint8_t x47 = (uint8_t)(x45 >> 8);
-  uint8_t x48 = (uint8_t)(x45 & UINT8_C(0xff));
-  uint8_t x49 = (uint8_t)(x47 & UINT8_C(0xff));
-  uint64_t x50 = (x1 >> 8);
-  uint8_t x51 = (uint8_t)(x1 & UINT8_C(0xff));
-  uint64_t x52 = (x50 >> 8);
-  uint8_t x53 = (uint8_t)(x50 & UINT8_C(0xff));
-  uint64_t x54 = (x52 >> 8);
-  uint8_t x55 = (uint8_t)(x52 & UINT8_C(0xff));
-  uint64_t x56 = (x54 >> 8);
-  uint8_t x57 = (uint8_t)(x54 & UINT8_C(0xff));
-  uint64_t x58 = (x56 >> 8);
-  uint8_t x59 = (uint8_t)(x56 & UINT8_C(0xff));
-  uint64_t x60 = (x58 >> 8);
-  uint8_t x61 = (uint8_t)(x58 & UINT8_C(0xff));
-  uint8_t x62 = (uint8_t)(x60 >> 8);
-  uint8_t x63 = (uint8_t)(x60 & UINT8_C(0xff));
-  out1[0] = x6;
-  out1[1] = x8;
-  out1[2] = x10;
-  out1[3] = x12;
-  out1[4] = x14;
-  out1[5] = x16;
-  out1[6] = x18;
-  out1[7] = x19;
-  out1[8] = x21;
-  out1[9] = x23;
-  out1[10] = x25;
-  out1[11] = x27;
-  out1[12] = x29;
-  out1[13] = x31;
-  out1[14] = x33;
-  out1[15] = x34;
-  out1[16] = x36;
-  out1[17] = x38;
-  out1[18] = x40;
-  out1[19] = x42;
-  out1[20] = x44;
-  out1[21] = x46;
-  out1[22] = x48;
-  out1[23] = x49;
-  out1[24] = x51;
-  out1[25] = x53;
-  out1[26] = x55;
-  out1[27] = x57;
-  out1[28] = x59;
-  out1[29] = x61;
-  out1[30] = x63;
-  out1[31] = x62;
+static FIAT_P256_FIAT_INLINE void fiat_p256_to_bytes(uint8_t out1[32], const uint64_t arg1[4]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint8_t x5;
+  uint64_t x6;
+  uint8_t x7;
+  uint64_t x8;
+  uint8_t x9;
+  uint64_t x10;
+  uint8_t x11;
+  uint64_t x12;
+  uint8_t x13;
+  uint64_t x14;
+  uint8_t x15;
+  uint64_t x16;
+  uint8_t x17;
+  uint8_t x18;
+  uint8_t x19;
+  uint64_t x20;
+  uint8_t x21;
+  uint64_t x22;
+  uint8_t x23;
+  uint64_t x24;
+  uint8_t x25;
+  uint64_t x26;
+  uint8_t x27;
+  uint64_t x28;
+  uint8_t x29;
+  uint64_t x30;
+  uint8_t x31;
+  uint8_t x32;
+  uint8_t x33;
+  uint64_t x34;
+  uint8_t x35;
+  uint64_t x36;
+  uint8_t x37;
+  uint64_t x38;
+  uint8_t x39;
+  uint64_t x40;
+  uint8_t x41;
+  uint64_t x42;
+  uint8_t x43;
+  uint64_t x44;
+  uint8_t x45;
+  uint8_t x46;
+  uint8_t x47;
+  uint64_t x48;
+  uint8_t x49;
+  uint64_t x50;
+  uint8_t x51;
+  uint64_t x52;
+  uint8_t x53;
+  uint64_t x54;
+  uint8_t x55;
+  uint64_t x56;
+  uint8_t x57;
+  uint64_t x58;
+  uint8_t x59;
+  uint8_t x60;
+  x1 = (arg1[3]);
+  x2 = (arg1[2]);
+  x3 = (arg1[1]);
+  x4 = (arg1[0]);
+  x5 = (uint8_t)(x4 & UINT8_C(0xff));
+  x6 = (x4 >> 8);
+  x7 = (uint8_t)(x6 & UINT8_C(0xff));
+  x8 = (x6 >> 8);
+  x9 = (uint8_t)(x8 & UINT8_C(0xff));
+  x10 = (x8 >> 8);
+  x11 = (uint8_t)(x10 & UINT8_C(0xff));
+  x12 = (x10 >> 8);
+  x13 = (uint8_t)(x12 & UINT8_C(0xff));
+  x14 = (x12 >> 8);
+  x15 = (uint8_t)(x14 & UINT8_C(0xff));
+  x16 = (x14 >> 8);
+  x17 = (uint8_t)(x16 & UINT8_C(0xff));
+  x18 = (uint8_t)(x16 >> 8);
+  x19 = (uint8_t)(x3 & UINT8_C(0xff));
+  x20 = (x3 >> 8);
+  x21 = (uint8_t)(x20 & UINT8_C(0xff));
+  x22 = (x20 >> 8);
+  x23 = (uint8_t)(x22 & UINT8_C(0xff));
+  x24 = (x22 >> 8);
+  x25 = (uint8_t)(x24 & UINT8_C(0xff));
+  x26 = (x24 >> 8);
+  x27 = (uint8_t)(x26 & UINT8_C(0xff));
+  x28 = (x26 >> 8);
+  x29 = (uint8_t)(x28 & UINT8_C(0xff));
+  x30 = (x28 >> 8);
+  x31 = (uint8_t)(x30 & UINT8_C(0xff));
+  x32 = (uint8_t)(x30 >> 8);
+  x33 = (uint8_t)(x2 & UINT8_C(0xff));
+  x34 = (x2 >> 8);
+  x35 = (uint8_t)(x34 & UINT8_C(0xff));
+  x36 = (x34 >> 8);
+  x37 = (uint8_t)(x36 & UINT8_C(0xff));
+  x38 = (x36 >> 8);
+  x39 = (uint8_t)(x38 & UINT8_C(0xff));
+  x40 = (x38 >> 8);
+  x41 = (uint8_t)(x40 & UINT8_C(0xff));
+  x42 = (x40 >> 8);
+  x43 = (uint8_t)(x42 & UINT8_C(0xff));
+  x44 = (x42 >> 8);
+  x45 = (uint8_t)(x44 & UINT8_C(0xff));
+  x46 = (uint8_t)(x44 >> 8);
+  x47 = (uint8_t)(x1 & UINT8_C(0xff));
+  x48 = (x1 >> 8);
+  x49 = (uint8_t)(x48 & UINT8_C(0xff));
+  x50 = (x48 >> 8);
+  x51 = (uint8_t)(x50 & UINT8_C(0xff));
+  x52 = (x50 >> 8);
+  x53 = (uint8_t)(x52 & UINT8_C(0xff));
+  x54 = (x52 >> 8);
+  x55 = (uint8_t)(x54 & UINT8_C(0xff));
+  x56 = (x54 >> 8);
+  x57 = (uint8_t)(x56 & UINT8_C(0xff));
+  x58 = (x56 >> 8);
+  x59 = (uint8_t)(x58 & UINT8_C(0xff));
+  x60 = (uint8_t)(x58 >> 8);
+  out1[0] = x5;
+  out1[1] = x7;
+  out1[2] = x9;
+  out1[3] = x11;
+  out1[4] = x13;
+  out1[5] = x15;
+  out1[6] = x17;
+  out1[7] = x18;
+  out1[8] = x19;
+  out1[9] = x21;
+  out1[10] = x23;
+  out1[11] = x25;
+  out1[12] = x27;
+  out1[13] = x29;
+  out1[14] = x31;
+  out1[15] = x32;
+  out1[16] = x33;
+  out1[17] = x35;
+  out1[18] = x37;
+  out1[19] = x39;
+  out1[20] = x41;
+  out1[21] = x43;
+  out1[22] = x45;
+  out1[23] = x46;
+  out1[24] = x47;
+  out1[25] = x49;
+  out1[26] = x51;
+  out1[27] = x53;
+  out1[28] = x55;
+  out1[29] = x57;
+  out1[30] = x59;
+  out1[31] = x60;
 }
 
 /*
- * The function fiat_p256_from_bytes deserializes a field element in the Montgomery domain from bytes in little-endian order.
+ * The function fiat_p256_from_bytes deserializes a field element NOT in the Montgomery domain from bytes in little-endian order.
+ *
  * Preconditions:
  *   0 ≤ bytes_eval arg1 < m
  * Postconditions:
@@ -1178,49 +1568,444 @@
  * Output Bounds:
  *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
  */
-static void fiat_p256_from_bytes(uint64_t out1[4], const uint8_t arg1[32]) {
-  uint64_t x1 = ((uint64_t)(arg1[31]) << 56);
-  uint64_t x2 = ((uint64_t)(arg1[30]) << 48);
-  uint64_t x3 = ((uint64_t)(arg1[29]) << 40);
-  uint64_t x4 = ((uint64_t)(arg1[28]) << 32);
-  uint64_t x5 = ((uint64_t)(arg1[27]) << 24);
-  uint64_t x6 = ((uint64_t)(arg1[26]) << 16);
-  uint64_t x7 = ((uint64_t)(arg1[25]) << 8);
-  uint8_t x8 = (arg1[24]);
-  uint64_t x9 = ((uint64_t)(arg1[23]) << 56);
-  uint64_t x10 = ((uint64_t)(arg1[22]) << 48);
-  uint64_t x11 = ((uint64_t)(arg1[21]) << 40);
-  uint64_t x12 = ((uint64_t)(arg1[20]) << 32);
-  uint64_t x13 = ((uint64_t)(arg1[19]) << 24);
-  uint64_t x14 = ((uint64_t)(arg1[18]) << 16);
-  uint64_t x15 = ((uint64_t)(arg1[17]) << 8);
-  uint8_t x16 = (arg1[16]);
-  uint64_t x17 = ((uint64_t)(arg1[15]) << 56);
-  uint64_t x18 = ((uint64_t)(arg1[14]) << 48);
-  uint64_t x19 = ((uint64_t)(arg1[13]) << 40);
-  uint64_t x20 = ((uint64_t)(arg1[12]) << 32);
-  uint64_t x21 = ((uint64_t)(arg1[11]) << 24);
-  uint64_t x22 = ((uint64_t)(arg1[10]) << 16);
-  uint64_t x23 = ((uint64_t)(arg1[9]) << 8);
-  uint8_t x24 = (arg1[8]);
-  uint64_t x25 = ((uint64_t)(arg1[7]) << 56);
-  uint64_t x26 = ((uint64_t)(arg1[6]) << 48);
-  uint64_t x27 = ((uint64_t)(arg1[5]) << 40);
-  uint64_t x28 = ((uint64_t)(arg1[4]) << 32);
-  uint64_t x29 = ((uint64_t)(arg1[3]) << 24);
-  uint64_t x30 = ((uint64_t)(arg1[2]) << 16);
-  uint64_t x31 = ((uint64_t)(arg1[1]) << 8);
-  uint8_t x32 = (arg1[0]);
-  uint64_t x33 = (x32 + (x31 + (x30 + (x29 + (x28 + (x27 + (x26 + x25)))))));
-  uint64_t x34 = (x33 & UINT64_C(0xffffffffffffffff));
-  uint64_t x35 = (x8 + (x7 + (x6 + (x5 + (x4 + (x3 + (x2 + x1)))))));
-  uint64_t x36 = (x16 + (x15 + (x14 + (x13 + (x12 + (x11 + (x10 + x9)))))));
-  uint64_t x37 = (x24 + (x23 + (x22 + (x21 + (x20 + (x19 + (x18 + x17)))))));
-  uint64_t x38 = (x37 & UINT64_C(0xffffffffffffffff));
-  uint64_t x39 = (x36 & UINT64_C(0xffffffffffffffff));
-  out1[0] = x34;
-  out1[1] = x38;
-  out1[2] = x39;
-  out1[3] = x35;
+static FIAT_P256_FIAT_INLINE void fiat_p256_from_bytes(uint64_t out1[4], const uint8_t arg1[32]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  uint64_t x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint8_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  uint64_t x13;
+  uint64_t x14;
+  uint64_t x15;
+  uint8_t x16;
+  uint64_t x17;
+  uint64_t x18;
+  uint64_t x19;
+  uint64_t x20;
+  uint64_t x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint8_t x24;
+  uint64_t x25;
+  uint64_t x26;
+  uint64_t x27;
+  uint64_t x28;
+  uint64_t x29;
+  uint64_t x30;
+  uint64_t x31;
+  uint8_t x32;
+  uint64_t x33;
+  uint64_t x34;
+  uint64_t x35;
+  uint64_t x36;
+  uint64_t x37;
+  uint64_t x38;
+  uint64_t x39;
+  uint64_t x40;
+  uint64_t x41;
+  uint64_t x42;
+  uint64_t x43;
+  uint64_t x44;
+  uint64_t x45;
+  uint64_t x46;
+  uint64_t x47;
+  uint64_t x48;
+  uint64_t x49;
+  uint64_t x50;
+  uint64_t x51;
+  uint64_t x52;
+  uint64_t x53;
+  uint64_t x54;
+  uint64_t x55;
+  uint64_t x56;
+  uint64_t x57;
+  uint64_t x58;
+  uint64_t x59;
+  uint64_t x60;
+  x1 = ((uint64_t)(arg1[31]) << 56);
+  x2 = ((uint64_t)(arg1[30]) << 48);
+  x3 = ((uint64_t)(arg1[29]) << 40);
+  x4 = ((uint64_t)(arg1[28]) << 32);
+  x5 = ((uint64_t)(arg1[27]) << 24);
+  x6 = ((uint64_t)(arg1[26]) << 16);
+  x7 = ((uint64_t)(arg1[25]) << 8);
+  x8 = (arg1[24]);
+  x9 = ((uint64_t)(arg1[23]) << 56);
+  x10 = ((uint64_t)(arg1[22]) << 48);
+  x11 = ((uint64_t)(arg1[21]) << 40);
+  x12 = ((uint64_t)(arg1[20]) << 32);
+  x13 = ((uint64_t)(arg1[19]) << 24);
+  x14 = ((uint64_t)(arg1[18]) << 16);
+  x15 = ((uint64_t)(arg1[17]) << 8);
+  x16 = (arg1[16]);
+  x17 = ((uint64_t)(arg1[15]) << 56);
+  x18 = ((uint64_t)(arg1[14]) << 48);
+  x19 = ((uint64_t)(arg1[13]) << 40);
+  x20 = ((uint64_t)(arg1[12]) << 32);
+  x21 = ((uint64_t)(arg1[11]) << 24);
+  x22 = ((uint64_t)(arg1[10]) << 16);
+  x23 = ((uint64_t)(arg1[9]) << 8);
+  x24 = (arg1[8]);
+  x25 = ((uint64_t)(arg1[7]) << 56);
+  x26 = ((uint64_t)(arg1[6]) << 48);
+  x27 = ((uint64_t)(arg1[5]) << 40);
+  x28 = ((uint64_t)(arg1[4]) << 32);
+  x29 = ((uint64_t)(arg1[3]) << 24);
+  x30 = ((uint64_t)(arg1[2]) << 16);
+  x31 = ((uint64_t)(arg1[1]) << 8);
+  x32 = (arg1[0]);
+  x33 = (x31 + (uint64_t)x32);
+  x34 = (x30 + x33);
+  x35 = (x29 + x34);
+  x36 = (x28 + x35);
+  x37 = (x27 + x36);
+  x38 = (x26 + x37);
+  x39 = (x25 + x38);
+  x40 = (x23 + (uint64_t)x24);
+  x41 = (x22 + x40);
+  x42 = (x21 + x41);
+  x43 = (x20 + x42);
+  x44 = (x19 + x43);
+  x45 = (x18 + x44);
+  x46 = (x17 + x45);
+  x47 = (x15 + (uint64_t)x16);
+  x48 = (x14 + x47);
+  x49 = (x13 + x48);
+  x50 = (x12 + x49);
+  x51 = (x11 + x50);
+  x52 = (x10 + x51);
+  x53 = (x9 + x52);
+  x54 = (x7 + (uint64_t)x8);
+  x55 = (x6 + x54);
+  x56 = (x5 + x55);
+  x57 = (x4 + x56);
+  x58 = (x3 + x57);
+  x59 = (x2 + x58);
+  x60 = (x1 + x59);
+  out1[0] = x39;
+  out1[1] = x46;
+  out1[2] = x53;
+  out1[3] = x60;
 }
 
+/*
+ * The function fiat_p256_set_one returns the field element one in the Montgomery domain.
+ *
+ * Postconditions:
+ *   eval (from_montgomery out1) mod m = 1 mod m
+ *   0 ≤ eval out1 < m
+ *
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_set_one(fiat_p256_montgomery_domain_field_element out1) {
+  out1[0] = 0x1;
+  out1[1] = UINT64_C(0xffffffff00000000);
+  out1[2] = UINT64_C(0xffffffffffffffff);
+  out1[3] = UINT32_C(0xfffffffe);
+}
+
+/*
+ * The function fiat_p256_msat returns the saturated representation of the prime modulus.
+ *
+ * Postconditions:
+ *   twos_complement_eval out1 = m
+ *   0 ≤ eval out1 < m
+ *
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_msat(uint64_t out1[5]) {
+  out1[0] = UINT64_C(0xffffffffffffffff);
+  out1[1] = UINT32_C(0xffffffff);
+  out1[2] = 0x0;
+  out1[3] = UINT64_C(0xffffffff00000001);
+  out1[4] = 0x0;
+}
+
+/*
+ * The function fiat_p256_divstep computes a divstep.
+ *
+ * Preconditions:
+ *   0 ≤ eval arg4 < m
+ *   0 ≤ eval arg5 < m
+ * Postconditions:
+ *   out1 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then 1 - arg1 else 1 + arg1)
+ *   twos_complement_eval out2 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then twos_complement_eval arg3 else twos_complement_eval arg2)
+ *   twos_complement_eval out3 = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then ⌊(twos_complement_eval arg3 - twos_complement_eval arg2) / 2⌋ else ⌊(twos_complement_eval arg3 + (twos_complement_eval arg3 mod 2) * twos_complement_eval arg2) / 2⌋)
+ *   eval (from_montgomery out4) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (2 * eval (from_montgomery arg5)) mod m else (2 * eval (from_montgomery arg4)) mod m)
+ *   eval (from_montgomery out5) mod m = (if 0 < arg1 ∧ (twos_complement_eval arg3) is odd then (eval (from_montgomery arg5) - eval (from_montgomery arg4)) mod m else (eval (from_montgomery arg5) + (twos_complement_eval arg3 mod 2) * eval (from_montgomery arg4)) mod m)
+ *   0 ≤ eval out4 < m
+ *   0 ≤ eval out5 < m
+ *   0 ≤ eval out2 < m
+ *   0 ≤ eval out3 < m
+ *
+ * Input Bounds:
+ *   arg1: [0x0 ~> 0xffffffffffffffff]
+ *   arg2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   arg5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ * Output Bounds:
+ *   out1: [0x0 ~> 0xffffffffffffffff]
+ *   out2: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   out3: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   out4: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ *   out5: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_divstep(uint64_t* out1, uint64_t out2[5], uint64_t out3[5], uint64_t out4[4], uint64_t out5[4], uint64_t arg1, const uint64_t arg2[5], const uint64_t arg3[5], const uint64_t arg4[4], const uint64_t arg5[4]) {
+  uint64_t x1;
+  fiat_p256_uint1 x2;
+  fiat_p256_uint1 x3;
+  uint64_t x4;
+  fiat_p256_uint1 x5;
+  uint64_t x6;
+  uint64_t x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  uint64_t x12;
+  fiat_p256_uint1 x13;
+  uint64_t x14;
+  fiat_p256_uint1 x15;
+  uint64_t x16;
+  fiat_p256_uint1 x17;
+  uint64_t x18;
+  fiat_p256_uint1 x19;
+  uint64_t x20;
+  fiat_p256_uint1 x21;
+  uint64_t x22;
+  uint64_t x23;
+  uint64_t x24;
+  uint64_t x25;
+  uint64_t x26;
+  uint64_t x27;
+  uint64_t x28;
+  uint64_t x29;
+  uint64_t x30;
+  uint64_t x31;
+  fiat_p256_uint1 x32;
+  uint64_t x33;
+  fiat_p256_uint1 x34;
+  uint64_t x35;
+  fiat_p256_uint1 x36;
+  uint64_t x37;
+  fiat_p256_uint1 x38;
+  uint64_t x39;
+  fiat_p256_uint1 x40;
+  uint64_t x41;
+  fiat_p256_uint1 x42;
+  uint64_t x43;
+  fiat_p256_uint1 x44;
+  uint64_t x45;
+  fiat_p256_uint1 x46;
+  uint64_t x47;
+  fiat_p256_uint1 x48;
+  uint64_t x49;
+  uint64_t x50;
+  uint64_t x51;
+  uint64_t x52;
+  uint64_t x53;
+  fiat_p256_uint1 x54;
+  uint64_t x55;
+  fiat_p256_uint1 x56;
+  uint64_t x57;
+  fiat_p256_uint1 x58;
+  uint64_t x59;
+  fiat_p256_uint1 x60;
+  uint64_t x61;
+  uint64_t x62;
+  fiat_p256_uint1 x63;
+  uint64_t x64;
+  fiat_p256_uint1 x65;
+  uint64_t x66;
+  fiat_p256_uint1 x67;
+  uint64_t x68;
+  fiat_p256_uint1 x69;
+  uint64_t x70;
+  uint64_t x71;
+  uint64_t x72;
+  uint64_t x73;
+  fiat_p256_uint1 x74;
+  uint64_t x75;
+  uint64_t x76;
+  uint64_t x77;
+  uint64_t x78;
+  uint64_t x79;
+  uint64_t x80;
+  fiat_p256_uint1 x81;
+  uint64_t x82;
+  fiat_p256_uint1 x83;
+  uint64_t x84;
+  fiat_p256_uint1 x85;
+  uint64_t x86;
+  fiat_p256_uint1 x87;
+  uint64_t x88;
+  fiat_p256_uint1 x89;
+  uint64_t x90;
+  uint64_t x91;
+  uint64_t x92;
+  uint64_t x93;
+  uint64_t x94;
+  fiat_p256_uint1 x95;
+  uint64_t x96;
+  fiat_p256_uint1 x97;
+  uint64_t x98;
+  fiat_p256_uint1 x99;
+  uint64_t x100;
+  fiat_p256_uint1 x101;
+  uint64_t x102;
+  fiat_p256_uint1 x103;
+  uint64_t x104;
+  fiat_p256_uint1 x105;
+  uint64_t x106;
+  fiat_p256_uint1 x107;
+  uint64_t x108;
+  fiat_p256_uint1 x109;
+  uint64_t x110;
+  fiat_p256_uint1 x111;
+  uint64_t x112;
+  fiat_p256_uint1 x113;
+  uint64_t x114;
+  uint64_t x115;
+  uint64_t x116;
+  uint64_t x117;
+  uint64_t x118;
+  uint64_t x119;
+  uint64_t x120;
+  uint64_t x121;
+  uint64_t x122;
+  uint64_t x123;
+  uint64_t x124;
+  uint64_t x125;
+  uint64_t x126;
+  fiat_p256_addcarryx_u64(&x1, &x2, 0x0, (~arg1), 0x1);
+  x3 = (fiat_p256_uint1)((fiat_p256_uint1)(x1 >> 63) & (fiat_p256_uint1)((arg3[0]) & 0x1));
+  fiat_p256_addcarryx_u64(&x4, &x5, 0x0, (~arg1), 0x1);
+  fiat_p256_cmovznz_u64(&x6, x3, arg1, x4);
+  fiat_p256_cmovznz_u64(&x7, x3, (arg2[0]), (arg3[0]));
+  fiat_p256_cmovznz_u64(&x8, x3, (arg2[1]), (arg3[1]));
+  fiat_p256_cmovznz_u64(&x9, x3, (arg2[2]), (arg3[2]));
+  fiat_p256_cmovznz_u64(&x10, x3, (arg2[3]), (arg3[3]));
+  fiat_p256_cmovznz_u64(&x11, x3, (arg2[4]), (arg3[4]));
+  fiat_p256_addcarryx_u64(&x12, &x13, 0x0, 0x1, (~(arg2[0])));
+  fiat_p256_addcarryx_u64(&x14, &x15, x13, 0x0, (~(arg2[1])));
+  fiat_p256_addcarryx_u64(&x16, &x17, x15, 0x0, (~(arg2[2])));
+  fiat_p256_addcarryx_u64(&x18, &x19, x17, 0x0, (~(arg2[3])));
+  fiat_p256_addcarryx_u64(&x20, &x21, x19, 0x0, (~(arg2[4])));
+  fiat_p256_cmovznz_u64(&x22, x3, (arg3[0]), x12);
+  fiat_p256_cmovznz_u64(&x23, x3, (arg3[1]), x14);
+  fiat_p256_cmovznz_u64(&x24, x3, (arg3[2]), x16);
+  fiat_p256_cmovznz_u64(&x25, x3, (arg3[3]), x18);
+  fiat_p256_cmovznz_u64(&x26, x3, (arg3[4]), x20);
+  fiat_p256_cmovznz_u64(&x27, x3, (arg4[0]), (arg5[0]));
+  fiat_p256_cmovznz_u64(&x28, x3, (arg4[1]), (arg5[1]));
+  fiat_p256_cmovznz_u64(&x29, x3, (arg4[2]), (arg5[2]));
+  fiat_p256_cmovznz_u64(&x30, x3, (arg4[3]), (arg5[3]));
+  fiat_p256_addcarryx_u64(&x31, &x32, 0x0, x27, x27);
+  fiat_p256_addcarryx_u64(&x33, &x34, x32, x28, x28);
+  fiat_p256_addcarryx_u64(&x35, &x36, x34, x29, x29);
+  fiat_p256_addcarryx_u64(&x37, &x38, x36, x30, x30);
+  fiat_p256_subborrowx_u64(&x39, &x40, 0x0, x31, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x41, &x42, x40, x33, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x43, &x44, x42, x35, 0x0);
+  fiat_p256_subborrowx_u64(&x45, &x46, x44, x37, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x47, &x48, x46, x38, 0x0);
+  x49 = (arg4[3]);
+  x50 = (arg4[2]);
+  x51 = (arg4[1]);
+  x52 = (arg4[0]);
+  fiat_p256_subborrowx_u64(&x53, &x54, 0x0, 0x0, x52);
+  fiat_p256_subborrowx_u64(&x55, &x56, x54, 0x0, x51);
+  fiat_p256_subborrowx_u64(&x57, &x58, x56, 0x0, x50);
+  fiat_p256_subborrowx_u64(&x59, &x60, x58, 0x0, x49);
+  fiat_p256_cmovznz_u64(&x61, x60, 0x0, UINT64_C(0xffffffffffffffff));
+  fiat_p256_addcarryx_u64(&x62, &x63, 0x0, x53, x61);
+  fiat_p256_addcarryx_u64(&x64, &x65, x63, x55, (x61 & UINT32_C(0xffffffff)));
+  fiat_p256_addcarryx_u64(&x66, &x67, x65, x57, 0x0);
+  fiat_p256_addcarryx_u64(&x68, &x69, x67, x59, (x61 & UINT64_C(0xffffffff00000001)));
+  fiat_p256_cmovznz_u64(&x70, x3, (arg5[0]), x62);
+  fiat_p256_cmovznz_u64(&x71, x3, (arg5[1]), x64);
+  fiat_p256_cmovznz_u64(&x72, x3, (arg5[2]), x66);
+  fiat_p256_cmovznz_u64(&x73, x3, (arg5[3]), x68);
+  x74 = (fiat_p256_uint1)(x22 & 0x1);
+  fiat_p256_cmovznz_u64(&x75, x74, 0x0, x7);
+  fiat_p256_cmovznz_u64(&x76, x74, 0x0, x8);
+  fiat_p256_cmovznz_u64(&x77, x74, 0x0, x9);
+  fiat_p256_cmovznz_u64(&x78, x74, 0x0, x10);
+  fiat_p256_cmovznz_u64(&x79, x74, 0x0, x11);
+  fiat_p256_addcarryx_u64(&x80, &x81, 0x0, x22, x75);
+  fiat_p256_addcarryx_u64(&x82, &x83, x81, x23, x76);
+  fiat_p256_addcarryx_u64(&x84, &x85, x83, x24, x77);
+  fiat_p256_addcarryx_u64(&x86, &x87, x85, x25, x78);
+  fiat_p256_addcarryx_u64(&x88, &x89, x87, x26, x79);
+  fiat_p256_cmovznz_u64(&x90, x74, 0x0, x27);
+  fiat_p256_cmovznz_u64(&x91, x74, 0x0, x28);
+  fiat_p256_cmovznz_u64(&x92, x74, 0x0, x29);
+  fiat_p256_cmovznz_u64(&x93, x74, 0x0, x30);
+  fiat_p256_addcarryx_u64(&x94, &x95, 0x0, x70, x90);
+  fiat_p256_addcarryx_u64(&x96, &x97, x95, x71, x91);
+  fiat_p256_addcarryx_u64(&x98, &x99, x97, x72, x92);
+  fiat_p256_addcarryx_u64(&x100, &x101, x99, x73, x93);
+  fiat_p256_subborrowx_u64(&x102, &x103, 0x0, x94, UINT64_C(0xffffffffffffffff));
+  fiat_p256_subborrowx_u64(&x104, &x105, x103, x96, UINT32_C(0xffffffff));
+  fiat_p256_subborrowx_u64(&x106, &x107, x105, x98, 0x0);
+  fiat_p256_subborrowx_u64(&x108, &x109, x107, x100, UINT64_C(0xffffffff00000001));
+  fiat_p256_subborrowx_u64(&x110, &x111, x109, x101, 0x0);
+  fiat_p256_addcarryx_u64(&x112, &x113, 0x0, x6, 0x1);
+  x114 = ((x80 >> 1) | ((x82 << 63) & UINT64_C(0xffffffffffffffff)));
+  x115 = ((x82 >> 1) | ((x84 << 63) & UINT64_C(0xffffffffffffffff)));
+  x116 = ((x84 >> 1) | ((x86 << 63) & UINT64_C(0xffffffffffffffff)));
+  x117 = ((x86 >> 1) | ((x88 << 63) & UINT64_C(0xffffffffffffffff)));
+  x118 = ((x88 & UINT64_C(0x8000000000000000)) | (x88 >> 1));
+  fiat_p256_cmovznz_u64(&x119, x48, x39, x31);
+  fiat_p256_cmovznz_u64(&x120, x48, x41, x33);
+  fiat_p256_cmovznz_u64(&x121, x48, x43, x35);
+  fiat_p256_cmovznz_u64(&x122, x48, x45, x37);
+  fiat_p256_cmovznz_u64(&x123, x111, x102, x94);
+  fiat_p256_cmovznz_u64(&x124, x111, x104, x96);
+  fiat_p256_cmovznz_u64(&x125, x111, x106, x98);
+  fiat_p256_cmovznz_u64(&x126, x111, x108, x100);
+  *out1 = x112;
+  out2[0] = x7;
+  out2[1] = x8;
+  out2[2] = x9;
+  out2[3] = x10;
+  out2[4] = x11;
+  out3[0] = x114;
+  out3[1] = x115;
+  out3[2] = x116;
+  out3[3] = x117;
+  out3[4] = x118;
+  out4[0] = x119;
+  out4[1] = x120;
+  out4[2] = x121;
+  out4[3] = x122;
+  out5[0] = x123;
+  out5[1] = x124;
+  out5[2] = x125;
+  out5[3] = x126;
+}
+
+/*
+ * The function fiat_p256_divstep_precomp returns the precomputed value for Bernstein-Yang-inversion (in montgomery form).
+ *
+ * Postconditions:
+ *   eval (from_montgomery out1) = ⌊(m - 1) / 2⌋^(if ⌊log2 m⌋ + 1 < 46 then ⌊(49 * (⌊log2 m⌋ + 1) + 80) / 17⌋ else ⌊(49 * (⌊log2 m⌋ + 1) + 57) / 17⌋)
+ *   0 ≤ eval out1 < m
+ *
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
+ */
+static FIAT_P256_FIAT_INLINE void fiat_p256_divstep_precomp(uint64_t out1[4]) {
+  out1[0] = UINT64_C(0x67ffffffb8000000);
+  out1[1] = UINT64_C(0xc000000038000000);
+  out1[2] = UINT64_C(0xd80000007fffffff);
+  out1[3] = UINT64_C(0x2fffffffffffffff);
+}