Add option to avoid 64-bit multiplication

Motivation is similar to NO_UDBL_DIVISION. The alternative implementation of 64-bit mult is straightforward and aims at obvious correctness. Also, visual examination of the generated assembly shows that it's quite efficient with clang, armcc5 and arm-clang. However, current GCC generates fairly inefficient code for it. I tried to rework the code in order to make GCC generate more efficient code. Unfortunately the only way to do that is to get rid of 64-bit add and handle the carry manually, but this causes other compilers to generate less efficient code with branches, which is not acceptable from a side-channel point of view. So let's keep the obvious code that works for most compilers and hope future versions of GCC learn to manage registers in a sensible way in that context. See https://bugs.launchpad.net/gcc-arm-embedded/+bug/1775263

commit: 2adb375c50e2db5f44dd1ce8b7cb4b33b035563a [log] [tgz]
author: Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com> Thu Jun 07 10:51:44 2018 +0200
committer: Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com> Thu Jun 07 11:05:33 2018 +0200
tree: ac64868554c1f24d7f5702c3467acf840219d322
parent: 94175a50f7ec89ecf704b92f6d90bfc9d33dbdf4 [diff]
diff --git a/include/mbedtls/config.h b/include/mbedtls/config.h
index 663c984..bde5a45 100644
--- a/include/mbedtls/config.h
+++ b/include/mbedtls/config.h

@@ -85,6 +85,28 @@
 //#define MBEDTLS_NO_UDBL_DIVISION
 
 /**
+ * \def MBEDTLS_NO_64BIT_MULTIPLICATION
+ *
+ * The platform lacks support for 32x32 -> 64-bit multiplication.
+ *
+ * Used in:
+ *      library/poly1305.c
+ *
+ * Some parts of the library may use multiplication of two unsigned 32-bit
+ * operands with a 64-bit result in order to speed up computations. On some
+ * platforms, this is not available in hardware and has to be implemented in
+ * software, usually in a library provided by the toolchain.
+ *
+ * Sometimes it is not desirable to have to link to that library. This option
+ * removes the dependency on that library on platforms that lack a hardware
+ * 64-bit multiplier by embedding a software implementation in Mbed TLS.
+ *
+ * Note that depending on the compiler, this may decrease performance compared
+ * to using the library function provided by the toolchain.
+ */
+//#define MBEDTLS_NO_64BIT_MULTIPLICATION
+
+/**
  * \def MBEDTLS_HAVE_SSE2
  *
  * CPU supports SSE2 instruction set.

diff --git a/library/poly1305.c b/library/poly1305.c
index 41e83f3..bafe613 100644
--- a/library/poly1305.c
+++ b/library/poly1305.c

@@ -53,6 +53,34 @@
           | (uint32_t) ( (uint32_t) data[( offset ) + 3] << 24 )  \
     )
 
+/*
+ * Our implementation is tuned for 32-bit platforms with a 64-bit multiplier.
+ * However, we provide an alternative for platforms without such a multiplier.
+ */
+#if defined(MBEDTLS_NO_64BIT_MULTIPLICATION)
+static uint64_t mul64( uint32_t a, uint32_t b )
+{
+    /* Split into 16-bit halves: a = al + 2**16 ah, b = bl + 2**16 bh */
+    const uint16_t al = (uint16_t) a;
+    const uint16_t bl = (uint16_t) b;
+    const uint16_t ah = a >> 16;
+    const uint16_t bh = b >> 16;
+
+    /* ab = al*bl + 2**16 (ah*bl + al*bh) + 2**32 ah*bh */
+    const uint32_t lo = (uint32_t) al * bl;
+    const uint64_t me = (uint64_t)( (uint32_t) ah * bl ) + (uint32_t) al * bh;
+    const uint32_t hi = (uint32_t) ah * bh;
+
+    return( lo + ( me << 16 ) + ( (uint64_t) hi << 32 ) );
+}
+#else
+static inline uint64_t mul64( uint32_t a, uint32_t b )
+{
+    return( (uint64_t) a * b ); /* widen first so the product is 64-bit */
+}
+#endif
+
+
 /**
  * \brief                   Process blocks with Poly1305.
  *
@@ -112,25 +140,25 @@
         acc4 += (uint32_t) ( d3 >> 32U ) + needs_padding;
 
         /* Compute: acc *= r */
-        d0 = ( (uint64_t) acc0 * r0  ) +
-             ( (uint64_t) acc1 * rs3 ) +
-             ( (uint64_t) acc2 * rs2 ) +
-             ( (uint64_t) acc3 * rs1 );
-        d1 = ( (uint64_t) acc0 * r1  ) +
-             ( (uint64_t) acc1 * r0  ) +
-             ( (uint64_t) acc2 * rs3 ) +
-             ( (uint64_t) acc3 * rs2 ) +
-             ( (uint64_t) acc4 * rs1 );
-        d2 = ( (uint64_t) acc0 * r2  ) +
-             ( (uint64_t) acc1 * r1  ) +
-             ( (uint64_t) acc2 * r0  ) +
-             ( (uint64_t) acc3 * rs3 ) +
-             ( (uint64_t) acc4 * rs2 );
-        d3 = ( (uint64_t) acc0 * r3  ) +
-             ( (uint64_t) acc1 * r2  ) +
-             ( (uint64_t) acc2 * r1  ) +
-             ( (uint64_t) acc3 * r0  ) +
-             ( (uint64_t) acc4 * rs3 );
+        d0 = mul64( acc0, r0  ) +
+             mul64( acc1, rs3 ) +
+             mul64( acc2, rs2 ) +
+             mul64( acc3, rs1 );
+        d1 = mul64( acc0, r1  ) +
+             mul64( acc1, r0  ) +
+             mul64( acc2, rs3 ) +
+             mul64( acc3, rs2 ) +
+             mul64( acc4, rs1 );
+        d2 = mul64( acc0, r2  ) +
+             mul64( acc1, r1  ) +
+             mul64( acc2, r0  ) +
+             mul64( acc3, rs3 ) +
+             mul64( acc4, rs2 );
+        d3 = mul64( acc0, r3  ) +
+             mul64( acc1, r2  ) +
+             mul64( acc2, r1  ) +
+             mul64( acc3, r0  ) +
+             mul64( acc4, rs3 );
         acc4 *= r0;
 
         /* Compute: acc %= (2^130 - 5) (partial remainder) */

diff --git a/library/version_features.c b/library/version_features.c
index c0a5a3c..21b3477 100644
--- a/library/version_features.c
+++ b/library/version_features.c

@@ -39,6 +39,9 @@
 #if defined(MBEDTLS_NO_UDBL_DIVISION)
     "MBEDTLS_NO_UDBL_DIVISION",
 #endif /* MBEDTLS_NO_UDBL_DIVISION */
+#if defined(MBEDTLS_NO_64BIT_MULTIPLICATION)
+    "MBEDTLS_NO_64BIT_MULTIPLICATION",
+#endif /* MBEDTLS_NO_64BIT_MULTIPLICATION */
 #if defined(MBEDTLS_HAVE_SSE2)
     "MBEDTLS_HAVE_SSE2",
 #endif /* MBEDTLS_HAVE_SSE2 */

diff --git a/scripts/config.pl b/scripts/config.pl
index 5bf2785..a89787a 100755
--- a/scripts/config.pl
+++ b/scripts/config.pl

@@ -95,6 +95,7 @@
 MBEDTLS_ZLIB_SUPPORT
 MBEDTLS_PKCS11_C
 MBEDTLS_NO_UDBL_DIVISION
+MBEDTLS_NO_64BIT_MULTIPLICATION
 _ALT\s*$
 );
 

diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh
index e6c7549..83011f5 100755
--- a/tests/scripts/all.sh
+++ b/tests/scripts/all.sh

@@ -344,6 +344,12 @@
     fi
 }
 
+# Run the given command and invert its exit status; to be used instead
+# of ! for commands run with record_status or if_build_succeeded
+not() {
+    ! "$@"
+}
+
 msg "info: $0 configuration"
 echo "MEMORY: $MEMORY"
 echo "FORCE: $FORCE"
@@ -691,6 +697,31 @@
 msg "test: gcc, force 64-bit bignum limbs"
 make test
 
+
+msg "build: MBEDTLS_NO_UDBL_DIVISION native" # ~ 10s
+cleanup
+cp "$CONFIG_H" "$CONFIG_BAK"
+scripts/config.pl full
+scripts/config.pl unset MBEDTLS_MEMORY_BACKTRACE # too slow for tests
+scripts/config.pl set MBEDTLS_NO_UDBL_DIVISION
+make CFLAGS='-Werror -O1'
+
+msg "test: MBEDTLS_NO_UDBL_DIVISION native" # ~ 10s
+make test
+
+
+msg "build: MBEDTLS_NO_64BIT_MULTIPLICATION native" # ~ 10s
+cleanup
+cp "$CONFIG_H" "$CONFIG_BAK"
+scripts/config.pl full
+scripts/config.pl unset MBEDTLS_MEMORY_BACKTRACE # too slow for tests
+scripts/config.pl set MBEDTLS_NO_64BIT_MULTIPLICATION
+make CFLAGS='-Werror -O1'
+
+msg "test: MBEDTLS_NO_64BIT_MULTIPLICATION native" # ~ 10s
+make test
+
+
 msg "build: arm-none-eabi-gcc, make" # ~ 10s
 cleanup
 cp "$CONFIG_H" "$CONFIG_BAK"
@@ -726,7 +757,27 @@
 scripts/config.pl set MBEDTLS_NO_UDBL_DIVISION
 make CC=arm-none-eabi-gcc AR=arm-none-eabi-ar LD=arm-none-eabi-ld CFLAGS='-Werror -Wall -Wextra' lib
 echo "Checking that software 64-bit division is not required"
-! grep __aeabi_uldiv library/*.o
+if_build_succeeded not grep __aeabi_uldiv library/*.o
+
+msg "build: arm-none-eabi-gcc MBEDTLS_NO_64BIT_MULTIPLICATION, make" # ~ 10s
+cleanup
+cp "$CONFIG_H" "$CONFIG_BAK"
+scripts/config.pl full
+scripts/config.pl unset MBEDTLS_NET_C
+scripts/config.pl unset MBEDTLS_TIMING_C
+scripts/config.pl unset MBEDTLS_FS_IO
+scripts/config.pl unset MBEDTLS_ENTROPY_NV_SEED
+scripts/config.pl set MBEDTLS_NO_PLATFORM_ENTROPY
+# following things are not in the default config
+scripts/config.pl unset MBEDTLS_HAVEGE_C # depends on timing.c
+scripts/config.pl unset MBEDTLS_THREADING_PTHREAD
+scripts/config.pl unset MBEDTLS_THREADING_C
+scripts/config.pl unset MBEDTLS_MEMORY_BACKTRACE # execinfo.h
+scripts/config.pl unset MBEDTLS_MEMORY_BUFFER_ALLOC_C # calls exit
+scripts/config.pl set MBEDTLS_NO_64BIT_MULTIPLICATION
+make CC=arm-none-eabi-gcc AR=arm-none-eabi-ar LD=arm-none-eabi-ld CFLAGS='-Werror -O1 -march=armv6-m -mthumb' lib
+echo "Checking that software 64-bit multiplication is not required"
+if_build_succeeded not grep __aeabi_lmul library/*.o
 
 msg "build: ARM Compiler 5, make"
 cleanup
commit	2adb375c50e2db5f44dd1ce8b7cb4b33b035563a	[log] [tgz]
author	Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com>	Thu Jun 07 10:51:44 2018 +0200
committer	Manuel Pégourié-Gonnard <manuel.pegourie-gonnard@arm.com>	Thu Jun 07 11:05:33 2018 +0200
tree	ac64868554c1f24d7f5702c3467acf840219d322
parent	94175a50f7ec89ecf704b92f6d90bfc9d33dbdf4 [diff]