ChaCha20: allow in-place en/decryption

All other ciphers so far allow this. In particular, the TLS layer depends on
it, despite what is documented in the Cipher layer; see:
https://github.com/ARMmbed/mbedtls/issues/1085
https://github.com/ARMmbed/mbedtls/issues/1087

This is also useful for implementing ChaCha20-Poly1305 without depending on
the semi-internal function keystream_block() (see the next commit).
diff --git a/include/mbedtls/chacha20.h b/include/mbedtls/chacha20.h
index f88bd28..7999702 100644
--- a/include/mbedtls/chacha20.h
+++ b/include/mbedtls/chacha20.h
@@ -133,9 +133,8 @@
  *
  *                  This function is used to both encrypt and decrypt data.
  *
- * \note            The \p input and \p output buffers may overlap, but only
- *                  if input >= output (i.e. only if input points ahead of
- *                  the output pointer).
+ * \note            The \p input and \p output pointers must either be equal or
+ *                  point to non-overlapping buffers.
  *
  * \note            mbedtls_chacha20_setkey and mbedtls_chacha20_starts must be
  *                  called at least once to setup the context before this function
diff --git a/library/chacha20.c b/library/chacha20.c
index 28133a6..1abb96e 100644
--- a/library/chacha20.c
+++ b/library/chacha20.c
@@ -314,23 +314,22 @@
     /* Process full blocks */
     while ( size >= CHACHA20_BLOCK_SIZE_BYTES )
     {
-        mbedtls_chacha20_block( ctx->initial_state, ctx->working_state, &output[offset] );
+        /* Generate new keystream block and increment counter */
+        mbedtls_chacha20_block( ctx->initial_state, ctx->working_state, ctx->keystream8 );
+        ctx->initial_state[CHACHA20_CTR_INDEX]++;
 
         for ( i = 0U; i < 64U; i += 8U )
         {
-            output[offset + i     ] ^= input[offset + i     ];
-            output[offset + i + 1U] ^= input[offset + i + 1U];
-            output[offset + i + 2U] ^= input[offset + i + 2U];
-            output[offset + i + 3U] ^= input[offset + i + 3U];
-            output[offset + i + 4U] ^= input[offset + i + 4U];
-            output[offset + i + 5U] ^= input[offset + i + 5U];
-            output[offset + i + 6U] ^= input[offset + i + 6U];
-            output[offset + i + 7U] ^= input[offset + i + 7U];
+            output[offset + i      ] = input[offset + i      ] ^ ctx->keystream8[i      ];
+            output[offset + i + 1U ] = input[offset + i + 1U ] ^ ctx->keystream8[i + 1U ];
+            output[offset + i + 2U ] = input[offset + i + 2U ] ^ ctx->keystream8[i + 2U ];
+            output[offset + i + 3U ] = input[offset + i + 3U ] ^ ctx->keystream8[i + 3U ];
+            output[offset + i + 4U ] = input[offset + i + 4U ] ^ ctx->keystream8[i + 4U ];
+            output[offset + i + 5U ] = input[offset + i + 5U ] ^ ctx->keystream8[i + 5U ];
+            output[offset + i + 6U ] = input[offset + i + 6U ] ^ ctx->keystream8[i + 6U ];
+            output[offset + i + 7U ] = input[offset + i + 7U ] ^ ctx->keystream8[i + 7U ];
         }
 
-        /* Increment counter */
-        ctx->initial_state[CHACHA20_CTR_INDEX]++;
-
         offset += CHACHA20_BLOCK_SIZE_BYTES;
         size   -= CHACHA20_BLOCK_SIZE_BYTES;
     }
@@ -338,7 +337,9 @@
     /* Last (partial) block */
     if ( size > 0U )
     {
+        /* Generate new keystream block and increment counter */
         mbedtls_chacha20_block( ctx->initial_state, ctx->working_state, ctx->keystream8 );
+        ctx->initial_state[CHACHA20_CTR_INDEX]++;
 
         for ( i = 0U; i < size; i++)
         {
@@ -347,8 +348,6 @@
 
         ctx->keystream_bytes_used = size;
 
-        /* Increment counter */
-        ctx->initial_state[CHACHA20_CTR_INDEX]++;
     }
 
     return( 0 );