Document and slightly reorganize mod_pXXX
diff --git a/library/ecp.c b/library/ecp.c
index 21a2315..0f21e2e 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -483,9 +483,20 @@
 }
 
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
+/*
+ * Compared to the way things are presented in FIPS 186-3 D.2,
+ * we proceed in columns, from right (least significant chunk) to left,
+ * adding chunks to N in place, and keeping a carry for the next chunk.
+ * This avoids moving things around in memory, and uselessly adding zeros,
+ * compared to the more straightforward, line-oriented approach.
+ *
+ * For this prime we need to handle data in chunks of 64 bits.
+ * Since this is always a multiple of our basic t_uint, we can
+ * use a t_uint * to designate such a chunk, and small loops to handle it.
+ */
 
 /* Add 64-bit chunks (dst += src) and update carry */
-static inline void add_64( t_uint *dst, t_uint *src, t_uint *carry )
+static inline void add64( t_uint *dst, t_uint *src, t_uint *carry )
 {
     unsigned char i;
     t_uint c = 0;
@@ -508,11 +519,11 @@
     }
 }
 
-#define OFFSET      ( 8 / sizeof( t_uint ) )
-#define A( i )      ( N->p + ( i ) * OFFSET )
-#define ADD( i )    add_64( p, A( i ), &c )
-#define NEXT        p += OFFSET; carry64( p, &c )
-#define LAST        p += OFFSET; *p = c; while( ++p < end ) *p = 0
+#define WIDTH       8 / sizeof( t_uint )
+#define A( i )      N->p + i * WIDTH
+#define ADD( i )    add64( p, A( i ), &c )
+#define NEXT        p += WIDTH; carry64( p, &c )
+#define LAST        p += WIDTH; *p = c; while( ++p < end ) *p = 0
 
 /*
  * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1)
@@ -523,8 +534,9 @@
     t_uint c = 0;
     t_uint *p, *end;
 
-    /* Make sure we have the correct number of blocks */
-    MPI_CHK( mpi_grow( N, 6 * OFFSET ) );
+    /* Make sure we have enough blocks so that A(5) is legal */
+    MPI_CHK( mpi_grow( N, 6 * WIDTH ) );
+
     p = N->p;
     end = p + N->n;
 
@@ -536,28 +548,35 @@
     return( ret );
 }
 
-#undef OFFSET
+#undef WIDTH
 #undef A
 #undef ADD
 #undef NEXT
 #undef LAST
 #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
 
-#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+/*
+ * The reader is advised to first understand ecp_mod_p192() since the same
+ * general structure is used here, but with additional complications:
+ * (1) chunks of 32 bits, and (2) subtractions.
+ */
 
-static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *dst += src;
-    *carry += ( *dst < src );
-}
+/*
+ * For these primes, we need to handle data in chunks of 32 bits.
+ * This makes it more complicated if we use 64 bits limbs in MPI,
+ * which prevents us from using a uniform access method as for p192.
+ *
+ * So, we define a mini abstraction layer to access 32 bit chunks,
+ * load them in 'cur' for work, and store them back from 'cur' when done.
+ *
+ * While at it, also define the size of N in terms of 32-bit chunks.
+ */
+#define LOAD32      cur = A( i );
 
-static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *carry -= ( *dst < src );
-    *dst -= src;
-}
-
-#if defined(POLARSSL_HAVE_INT8)
+#if defined(POLARSSL_HAVE_INT8)     /* 8 bit */
 
 #define MAX32       N->n / 4
 #define A( j )      (uint32_t)( N->p[4*j+0]       ) |  \
@@ -569,20 +588,20 @@
                     N->p[4*i+2] = (uint8_t)( cur >> 16 );   \
                     N->p[4*i+3] = (uint8_t)( cur >> 24 );
 
-#elif defined(POLARSSL_HAVE_INT16)
+#elif defined(POLARSSL_HAVE_INT16)  /* 16 bit */
 
 #define MAX32       N->n / 2
 #define A( j )      (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 )
 #define STORE32     N->p[2*i+0] = (uint16_t)( cur       );  \
                     N->p[2*i+1] = (uint16_t)( cur >> 16 );
 
-#elif defined(POLARSSL_HAVE_INT32)
+#elif defined(POLARSSL_HAVE_INT32)  /* 32 bit */
 
 #define MAX32       N->n
 #define A( j )      N->p[j]
 #define STORE32     N->p[i] = cur;
 
-#else /* 64-bit */
+#else                               /* 64-bit */
 
 #define MAX32       N->n * 2
 #define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
@@ -595,14 +614,37 @@
         N->p[i/2] |= (uint64_t) cur;              \
     }
 
-#endif
+#endif /* sizeof( t_uint ) */
+
+/*
+ * Helpers for addition and subtraction of chunks, with signed carry.
+ */
+static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *dst += src;
+    *carry += ( *dst < src );
+}
+
+static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *carry -= ( *dst < src );
+    *dst -= src;
+}
 
 #define ADD( j )    add32( &cur, A( j ), &c );
 #define SUB( j )    sub32( &cur, A( j ), &c );
 
-#define LOAD32      cur = A( i );
-
-#define FIRST       c = 0; i = 0; LOAD32;
+/*
+ * Helpers for the main 'loop'
+ */
+#define INIT( b )                                           \
+    int ret;                                                \
+    signed char c = 0, cc;                                  \
+    uint32_t cur;                                           \
+    size_t i = 0, bits = b;                                 \
+                                                            \
+    MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \
+    LOAD32;
 
 #define NEXT                    \
     STORE32; i++; LOAD32;       \
@@ -638,22 +680,18 @@
 
     return( ret );
 }
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP384R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
 /*
  * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
  */
 static int ecp_mod_p224( mpi *N )
 {
-    int ret;
-    signed char c, cc;
-    uint32_t cur;
-    size_t i;
-    size_t bits = 224;
+    INIT( 224 );
 
-    /* Make sure we have enough blocks */
-    MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) );
-
-    FIRST;
     SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
     SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
     SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
@@ -667,15 +705,32 @@
 }
 #endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+
+#undef A
+#undef LOAD32
+#undef STORE32
+#undef MAX32
+#undef INIT
+#undef NEXT
+#undef LAST
+
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP384R1_ENABLED */
+
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
 /*
- * Size of p521 in terms of t_uint
+ * Here we have a real Mersenne prime, so things are more straightforward.
+ * However, things are aligned on a 'weird' boundary (521 bits).
  */
-#define P521_SIZE_INT   ( 521 / 8 / sizeof( t_uint ) + 1 )
 
-/*
- * Bits to keep in the most significant t_uint
- */
+/* Size of p521 in terms of t_uint */
+#define P521_WIDTH      ( 521 / 8 / sizeof( t_uint ) + 1 )
+
+/* Bits to keep in the most significant t_uint */
 #if defined(POLARSSL_HAVE_INT8)
 #define P521_MASK       0x01
 #else
@@ -691,26 +746,26 @@
     int ret;
     size_t i;
     mpi M;
-    t_uint Mp[P521_SIZE_INT+1];
-    /* Worst case for the size of M is when sizeof( t_uint ) == 16:
+    t_uint Mp[P521_WIDTH + 1];
+    /* Worst case for the size of M is when t_uint is 16 bits:
      * we need to hold bits 513 to 1056, which is 34 limbs, that is
-     * P521_SIZE_INT + 1. Otherwise P521_SIZE is enough. */
+     * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */
 
-    if( N->n < P521_SIZE_INT )
+    if( N->n < P521_WIDTH )
         return( 0 );
 
     /* M = A1 */
     M.s = 1;
-    M.n = N->n - ( P521_SIZE_INT - 1 );
-    if( M.n > P521_SIZE_INT + 1 )
-        M.n = P521_SIZE_INT + 1;
+    M.n = N->n - ( P521_WIDTH - 1 );
+    if( M.n > P521_WIDTH + 1 )
+        M.n = P521_WIDTH + 1;
     M.p = Mp;
-    memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) );
+    memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) );
     MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
 
     /* N = A0 */
-    N->p[P521_SIZE_INT - 1] &= P521_MASK;
-    for( i = P521_SIZE_INT; i < N->n; i++ )
+    N->p[P521_WIDTH - 1] &= P521_MASK;
+    for( i = P521_WIDTH; i < N->n; i++ )
         N->p[i] = 0;
 
     /* N = A0 + A1 */
@@ -719,6 +774,9 @@
 cleanup:
     return( ret );
 }
+
+#undef P521_WIDTH
+#undef P521_MASK
 #endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */
 
 /*