run GHASH of AAD and first AES permutation in parallel
diff --git a/lib/fusion.c b/lib/fusion.c
index a4535fd..514e03d 100644
--- a/lib/fusion.c
+++ b/lib/fusion.c
@@ -290,9 +290,6 @@
__m128i gdatabuf[6];
__m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8);
- const __m128i *gdata; // points to the elements fed into GHASH
- size_t gdata_cnt;
-
// src and dst are updated after the chunk is processed
const __m128i *src = input;
__m128i *dst = output;
@@ -314,16 +311,40 @@
ctr = _mm_insert_epi32(ctr, 1, 0);
ek0 = _mm_shuffle_epi8(ctr, bswap8);
- { /* prepare the first bit stream */
- size_t i;
- AESECB6_INIT();
- for (i = 1; i < ctx->ecb.rounds; ++i)
- AESECB6_UPDATE(i);
- AESECB6_FINAL(i);
+ /* start preparing AES */
+ AESECB6_INIT();
+ AESECB6_UPDATE(1);
+
+ /* build the first batch of GHASH input; only AAD can be fed at this point, because this batch is hashed while the first AES bit stream is still being generated */
+ const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH
+ size_t gdata_cnt = 0;
+ if (PTLS_LIKELY(aadlen != 0)) {
+ while (gdata_cnt < 6) {
+ if (PTLS_LIKELY(aadlen < 16)) {
+ if (aadlen != 0) {
+ gdatabuf[gdata_cnt++] = loadn(aad, aadlen);
+ aadlen = 0;
+ }
+ goto MainLoop;
+ }
+ gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
+ aadlen -= 16;
+ }
}
/* the main loop */
+MainLoop:
while (1) {
+ /* run AES and multiplication in parallel */
+ size_t i;
+ for (i = 2; i < gdata_cnt + 2; ++i) {
+ AESECB6_UPDATE(i);
+ gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
+ }
+ for (; i < ctx->ecb.rounds; ++i)
+ AESECB6_UPDATE(i);
+ AESECB6_FINAL(i);
+
/* apply the bit stream to src and write to dest */
if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i))
@@ -418,16 +439,6 @@
}
gdata = gdatabuf;
}
-
- /* run AES and multiplication in parallel */
- size_t i;
- for (i = 2; i <= 7; ++i) {
- AESECB6_UPDATE(i);
- gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
- }
- for (; i < ctx->ecb.rounds; ++i)
- AESECB6_UPDATE(i);
- AESECB6_FINAL(i);
}
Finish: