Add option to use smaller AES tables (table sizes reduced by 6144 bytes)

This patch adds MBEDTLS_AES_SMALL_TABLES option to reduce number of AES
look-up tables and thus save 6 KiB of memory. Enabling this option
cause performance hit MBEDTLS_AES_SMALL_TABLES of ~7% on ARM and ~15%
on x86-64.

Benchmark on Cortex-A7 (armhf):

Before:
  AES-CBC-128              :      14394 Kb/s,          0 cycles/byte
  AES-CBC-192              :      12442 Kb/s,          0 cycles/byte
  AES-CBC-256              :      10958 Kb/s,          0 cycles/byte

After:
  AES-CBC-128              :      13342 Kb/s,          0 cycles/byte
  AES-CBC-192              :      11469 Kb/s,          0 cycles/byte
  AES-CBC-256              :      10058 Kb/s,          0 cycles/byte

Benchmark on Intel Core i5-4570 (x86_64, 3.2 Ghz, no turbo):

Before:
  AES-CBC-128              :     215759 Kb/s,         14 cycles/byte
  AES-CBC-192              :     190884 Kb/s,         16 cycles/byte
  AES-CBC-256              :     171536 Kb/s,         18 cycles/byte

After:
  AES-CBC-128              :     185108 Kb/s,         16 cycles/byte
  AES-CBC-192              :     162839 Kb/s,         19 cycles/byte
  AES-CBC-256              :     144700 Kb/s,         21 cycles/byte
diff --git a/include/mbedtls/config.h b/include/mbedtls/config.h
index c4b8995..44def95 100644
--- a/include/mbedtls/config.h
+++ b/include/mbedtls/config.h
@@ -388,6 +388,15 @@
 //#define MBEDTLS_AES_ROM_TABLES
 
 /**
+ * \def MBEDTLS_AES_SMALL_TABLES
+ *
+ * Use less ROM/RAM for the AES implementation (saves about 6144 bytes).
+ *
+ * Uncomment this macro to use less memory for AES.
+ */
+//#define MBEDTLS_AES_SMALL_TABLES
+
+/**
  * \def MBEDTLS_CAMELLIA_SMALL_MEMORY
  *
  * Use less ROM for the Camellia implementation (saves about 768 bytes).
diff --git a/library/aes.c b/library/aes.c
index 5e01c4f..aabacf9 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -201,6 +201,8 @@
 static const uint32_t FT0[256] = { FT };
 #undef V
 
+#ifndef MBEDTLS_AES_SMALL_TABLES
+
 #define V(a,b,c,d) 0x##b##c##d##a
 static const uint32_t FT1[256] = { FT };
 #undef V
@@ -213,6 +215,8 @@
 static const uint32_t FT3[256] = { FT };
 #undef V
 
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
+
 #undef FT
 
 /*
@@ -328,6 +332,8 @@
 static const uint32_t RT0[256] = { RT };
 #undef V
 
+#ifndef MBEDTLS_AES_SMALL_TABLES
+
 #define V(a,b,c,d) 0x##b##c##d##a
 static const uint32_t RT1[256] = { RT };
 #undef V
@@ -340,6 +346,8 @@
 static const uint32_t RT3[256] = { RT };
 #undef V
 
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
+
 #undef RT
 
 /*
@@ -359,18 +367,22 @@
  */
 static unsigned char FSb[256];
 static uint32_t FT0[256];
+#ifndef MBEDTLS_AES_SMALL_TABLES
 static uint32_t FT1[256];
 static uint32_t FT2[256];
 static uint32_t FT3[256];
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
 
 /*
  * Reverse S-box & tables
  */
 static unsigned char RSb[256];
 static uint32_t RT0[256];
+#ifndef MBEDTLS_AES_SMALL_TABLES
 static uint32_t RT1[256];
 static uint32_t RT2[256];
 static uint32_t RT3[256];
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
 
 /*
  * Round constants
@@ -445,9 +457,11 @@
                  ( (uint32_t) x << 16 ) ^
                  ( (uint32_t) z << 24 );
 
+#ifndef MBEDTLS_AES_SMALL_TABLES
         FT1[i] = ROTL8( FT0[i] );
         FT2[i] = ROTL8( FT1[i] );
         FT3[i] = ROTL8( FT2[i] );
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
 
         x = RSb[i];
 
@@ -456,14 +470,48 @@
                  ( (uint32_t) MUL( 0x0D, x ) << 16 ) ^
                  ( (uint32_t) MUL( 0x0B, x ) << 24 );
 
+#ifndef MBEDTLS_AES_SMALL_TABLES
         RT1[i] = ROTL8( RT0[i] );
         RT2[i] = ROTL8( RT1[i] );
         RT3[i] = ROTL8( RT2[i] );
+#endif /* !MBEDTLS_AES_SMALL_TABLES */
     }
 }
 
+#undef ROTL8
+
 #endif /* MBEDTLS_AES_ROM_TABLES */
 
+#ifdef MBEDTLS_AES_SMALL_TABLES
+
+#define ROTL8(x)  ( (uint32_t)( ( x ) <<  8 ) + (uint32_t)( ( x ) >> 24 ) )
+#define ROTL16(x) ( (uint32_t)( ( x ) << 16 ) + (uint32_t)( ( x ) >> 16 ) )
+#define ROTL24(x) ( (uint32_t)( ( x ) << 24 ) + (uint32_t)( ( x ) >>  8 ) )
+
+#define AES_RT0(idx) RT0[idx]
+#define AES_RT1(idx) ROTL8(  RT0[idx] )
+#define AES_RT2(idx) ROTL16( RT0[idx] )
+#define AES_RT3(idx) ROTL24( RT0[idx] )
+
+#define AES_FT0(idx) FT0[idx]
+#define AES_FT1(idx) ROTL8(  FT0[idx] )
+#define AES_FT2(idx) ROTL16( FT0[idx] )
+#define AES_FT3(idx) ROTL24( FT0[idx] )
+
+#else /* MBEDTLS_AES_SMALL_TABLES */
+
+#define AES_RT0(idx) RT0[idx]
+#define AES_RT1(idx) RT1[idx]
+#define AES_RT2(idx) RT2[idx]
+#define AES_RT3(idx) RT3[idx]
+
+#define AES_FT0(idx) FT0[idx]
+#define AES_FT1(idx) FT1[idx]
+#define AES_FT2(idx) FT2[idx]
+#define AES_FT3(idx) FT3[idx]
+
+#endif /* MBEDTLS_AES_SMALL_TABLES */
+
 void mbedtls_aes_init( mbedtls_aes_context *ctx )
 {
     memset( ctx, 0, sizeof( mbedtls_aes_context ) );
@@ -641,10 +689,10 @@
     {
         for( j = 0; j < 4; j++, SK++ )
         {
-            *RK++ = RT0[ FSb[ ( *SK       ) & 0xFF ] ] ^
-                    RT1[ FSb[ ( *SK >>  8 ) & 0xFF ] ] ^
-                    RT2[ FSb[ ( *SK >> 16 ) & 0xFF ] ] ^
-                    RT3[ FSb[ ( *SK >> 24 ) & 0xFF ] ];
+            *RK++ = AES_RT0( FSb[ ( *SK       ) & 0xFF ] ) ^
+                    AES_RT1( FSb[ ( *SK >>  8 ) & 0xFF ] ) ^
+                    AES_RT2( FSb[ ( *SK >> 16 ) & 0xFF ] ) ^
+                    AES_RT3( FSb[ ( *SK >> 24 ) & 0xFF ] );
         }
     }
 
@@ -660,50 +708,50 @@
 }
 #endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */
 
-#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)     \
-{                                               \
-    X0 = *RK++ ^ FT0[ ( Y0       ) & 0xFF ] ^   \
-                 FT1[ ( Y1 >>  8 ) & 0xFF ] ^   \
-                 FT2[ ( Y2 >> 16 ) & 0xFF ] ^   \
-                 FT3[ ( Y3 >> 24 ) & 0xFF ];    \
-                                                \
-    X1 = *RK++ ^ FT0[ ( Y1       ) & 0xFF ] ^   \
-                 FT1[ ( Y2 >>  8 ) & 0xFF ] ^   \
-                 FT2[ ( Y3 >> 16 ) & 0xFF ] ^   \
-                 FT3[ ( Y0 >> 24 ) & 0xFF ];    \
-                                                \
-    X2 = *RK++ ^ FT0[ ( Y2       ) & 0xFF ] ^   \
-                 FT1[ ( Y3 >>  8 ) & 0xFF ] ^   \
-                 FT2[ ( Y0 >> 16 ) & 0xFF ] ^   \
-                 FT3[ ( Y1 >> 24 ) & 0xFF ];    \
-                                                \
-    X3 = *RK++ ^ FT0[ ( Y3       ) & 0xFF ] ^   \
-                 FT1[ ( Y0 >>  8 ) & 0xFF ] ^   \
-                 FT2[ ( Y1 >> 16 ) & 0xFF ] ^   \
-                 FT3[ ( Y2 >> 24 ) & 0xFF ];    \
+#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)         \
+{                                                   \
+    X0 = *RK++ ^ AES_FT0( ( Y0       ) & 0xFF ) ^   \
+                 AES_FT1( ( Y1 >>  8 ) & 0xFF ) ^   \
+                 AES_FT2( ( Y2 >> 16 ) & 0xFF ) ^   \
+                 AES_FT3( ( Y3 >> 24 ) & 0xFF );    \
+                                                    \
+    X1 = *RK++ ^ AES_FT0( ( Y1       ) & 0xFF ) ^   \
+                 AES_FT1( ( Y2 >>  8 ) & 0xFF ) ^   \
+                 AES_FT2( ( Y3 >> 16 ) & 0xFF ) ^   \
+                 AES_FT3( ( Y0 >> 24 ) & 0xFF );    \
+                                                    \
+    X2 = *RK++ ^ AES_FT0( ( Y2       ) & 0xFF ) ^   \
+                 AES_FT1( ( Y3 >>  8 ) & 0xFF ) ^   \
+                 AES_FT2( ( Y0 >> 16 ) & 0xFF ) ^   \
+                 AES_FT3( ( Y1 >> 24 ) & 0xFF );    \
+                                                    \
+    X3 = *RK++ ^ AES_FT0( ( Y3       ) & 0xFF ) ^   \
+                 AES_FT1( ( Y0 >>  8 ) & 0xFF ) ^   \
+                 AES_FT2( ( Y1 >> 16 ) & 0xFF ) ^   \
+                 AES_FT3( ( Y2 >> 24 ) & 0xFF );    \
 }
 
-#define AES_RROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)     \
-{                                               \
-    X0 = *RK++ ^ RT0[ ( Y0       ) & 0xFF ] ^   \
-                 RT1[ ( Y3 >>  8 ) & 0xFF ] ^   \
-                 RT2[ ( Y2 >> 16 ) & 0xFF ] ^   \
-                 RT3[ ( Y1 >> 24 ) & 0xFF ];    \
-                                                \
-    X1 = *RK++ ^ RT0[ ( Y1       ) & 0xFF ] ^   \
-                 RT1[ ( Y0 >>  8 ) & 0xFF ] ^   \
-                 RT2[ ( Y3 >> 16 ) & 0xFF ] ^   \
-                 RT3[ ( Y2 >> 24 ) & 0xFF ];    \
-                                                \
-    X2 = *RK++ ^ RT0[ ( Y2       ) & 0xFF ] ^   \
-                 RT1[ ( Y1 >>  8 ) & 0xFF ] ^   \
-                 RT2[ ( Y0 >> 16 ) & 0xFF ] ^   \
-                 RT3[ ( Y3 >> 24 ) & 0xFF ];    \
-                                                \
-    X3 = *RK++ ^ RT0[ ( Y3       ) & 0xFF ] ^   \
-                 RT1[ ( Y2 >>  8 ) & 0xFF ] ^   \
-                 RT2[ ( Y1 >> 16 ) & 0xFF ] ^   \
-                 RT3[ ( Y0 >> 24 ) & 0xFF ];    \
+#define AES_RROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)         \
+{                                                   \
+    X0 = *RK++ ^ AES_RT0( ( Y0       ) & 0xFF ) ^   \
+                 AES_RT1( ( Y3 >>  8 ) & 0xFF ) ^   \
+                 AES_RT2( ( Y2 >> 16 ) & 0xFF ) ^   \
+                 AES_RT3( ( Y1 >> 24 ) & 0xFF );    \
+                                                    \
+    X1 = *RK++ ^ AES_RT0( ( Y1       ) & 0xFF ) ^   \
+                 AES_RT1( ( Y0 >>  8 ) & 0xFF ) ^   \
+                 AES_RT2( ( Y3 >> 16 ) & 0xFF ) ^   \
+                 AES_RT3( ( Y2 >> 24 ) & 0xFF );    \
+                                                    \
+    X2 = *RK++ ^ AES_RT0( ( Y2       ) & 0xFF ) ^   \
+                 AES_RT1( ( Y1 >>  8 ) & 0xFF ) ^   \
+                 AES_RT2( ( Y0 >> 16 ) & 0xFF ) ^   \
+                 AES_RT3( ( Y3 >> 24 ) & 0xFF );    \
+                                                    \
+    X3 = *RK++ ^ AES_RT0( ( Y3       ) & 0xFF ) ^   \
+                 AES_RT1( ( Y2 >>  8 ) & 0xFF ) ^   \
+                 AES_RT2( ( Y1 >> 16 ) & 0xFF ) ^   \
+                 AES_RT3( ( Y0 >> 24 ) & 0xFF );    \
 }
 
 /*
diff --git a/library/version_features.c b/library/version_features.c
index 9f97c7b..2b65199 100644
--- a/library/version_features.c
+++ b/library/version_features.c
@@ -198,6 +198,9 @@
 #if defined(MBEDTLS_AES_ROM_TABLES)
     "MBEDTLS_AES_ROM_TABLES",
 #endif /* MBEDTLS_AES_ROM_TABLES */
+#if defined(MBEDTLS_AES_SMALL_TABLES)
+    "MBEDTLS_AES_SMALL_TABLES",
+#endif /* MBEDTLS_AES_SMALL_TABLES */
 #if defined(MBEDTLS_CAMELLIA_SMALL_MEMORY)
     "MBEDTLS_CAMELLIA_SMALL_MEMORY",
 #endif /* MBEDTLS_CAMELLIA_SMALL_MEMORY */