it works
diff --git a/lib/fusion.c b/lib/fusion.c
index d82a85a..31d8339 100644
--- a/lib/fusion.c
+++ b/lib/fusion.c
@@ -38,6 +38,7 @@
* IN THE SOFTWARE.
*/
#include <stdint.h>
+#include <string.h>
#include <tmmintrin.h>
#include <wmmintrin.h>
#include "picotls.h"
@@ -299,11 +300,224 @@
#undef FUNC
}
+static __m128i ghashn(ptls_fusion_aesgcm_context_t *ctx, const __m128i *src, size_t cnt, __m128i ghash)
+{
+ __m128i hi = _mm_setzero_si128(), lo = _mm_setzero_si128(), mid = _mm_setzero_si128();
+ assert(cnt <= 6);
+
+ for (size_t i = 0; i < cnt; ++i) {
+ __m128i X = _mm_loadu_si128(src + cnt - 1 - i);
+ X = _mm_shuffle_epi8(X, bswap8);
+ if (i == cnt - 1)
+ X = _mm_xor_si128(X, ghash);
+ __m128i t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x00);
+ lo = _mm_xor_si128(lo, t);
+ t = _mm_clmulepi64_si128(ctx->ghash[i].H, X, 0x11);
+ hi = _mm_xor_si128(hi, t);
+ t = _mm_shuffle_epi32(X, 78);
+ t = _mm_xor_si128(t, X);
+ t = _mm_clmulepi64_si128(ctx->ghash[i].r, t, 0x00);
+ mid = _mm_xor_si128(mid, t);
+ }
+
+ mid = _mm_xor_si128(mid, hi);
+ mid = _mm_xor_si128(mid, lo);
+ lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
+ hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
+
+ /* from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */
+ __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10);
+ lo = _mm_shuffle_epi32(lo, 78);
+ lo = _mm_xor_si128(lo, r);
+ r = _mm_clmulepi64_si128(lo, poly, 0x10);
+ lo = _mm_shuffle_epi32(lo, 78);
+ lo = _mm_xor_si128(lo, r);
+ ghash = _mm_xor_si128(hi, lo);
+
+ return ghash;
+}
+
static inline __m128i aesecb6ghash6(ptls_fusion_aesgcm_context_t *ctx, __m128i *data, const __m128i *gdata, __m128i ghash)
{
GHASH6(AESECB6);
}
+static inline __m128i loadn(const void *_p, size_t l)
+{
+ const uint8_t *p = _p;
+ uint8_t buf[16] = {};
+
+ for (size_t i = 0; i != l; ++i)
+ buf[i] = p[i];
+ return *(__m128i *)buf;
+}
+
+static inline void storen(void *_p, size_t l, __m128i v)
+{
+ uint8_t buf[16], *p = _p;
+
+ *(__m128i *)buf = v;
+
+ for (size_t i = 0; i != l; ++i)
+ p[i] = buf[i];
+}
+
+static inline void finish_gcm(ptls_fusion_aesgcm_context_t *ctx, __m128i *dst, const __m128i *dst_ghash, const __m128i *aad, size_t aadlen,
+ __m128i ghash, __m128i ac, __m128i ek0)
+{
+ const __m128i *enc = dst_ghash;
+ size_t enclen = (const uint8_t *)dst - (const uint8_t *)enc;
+ __m128i gdata[6];
+ int gdata_index;
+
+ while (1) {
+ gdata_index = 0;
+ if (aadlen != 0) {
+ while (aadlen >= 16) {
+ gdata[gdata_index++] = *aad++;
+ aadlen -= 16;
+ if (gdata_index == 6)
+ goto GHASH6;
+ }
+ if (aadlen != 0) {
+ gdata[gdata_index++] = loadn(aad, aadlen);
+ aadlen = 0;
+ if (gdata_index == 6)
+ goto GHASH6;
+ }
+ }
+ if (enclen != 0) {
+ while (enclen >= 16) {
+ gdata[gdata_index++] = *enc++;
+ enclen -= 16;
+ if (gdata_index == 6)
+ goto GHASH6;
+ }
+ if (enclen != 0) {
+ gdata[gdata_index++] = loadn(enc, enclen);
+ enclen = 0;
+ if (gdata_index == 6)
+ goto GHASH6;
+ }
+ }
+ __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
+ gdata[gdata_index++] = _mm_shuffle_epi8(ac, bswap64);
+ break;
+
+ GHASH6:
+ ghash = ghash6(ctx, gdata, ghash);
+ }
+
+ /* final */
+#if 0
+ for (int i = 0; i < gdata_index; ++i)
+ ghash = gfmul(_mm_xor_si128(ghash, _mm_shuffle_epi8(gdata[i], bswap8)), ctx->ghash[0].H);
+#else
+ ghash = ghashn(ctx, gdata, gdata_index, ghash);
+#endif
+ __m128i tag = _mm_shuffle_epi8(ghash, bswap8);
+ tag = _mm_xor_si128(tag, ek0);
+ _mm_storeu_si128(dst, tag);
+}
+
+void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, const void *iv, const void *_aad, size_t aadlen, void *_dst,
+ const void *_src, size_t srclen)
+{
+ __m128i bswap64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7), one = _mm_set_epi32(0, 1, 0, 0);
+ __m128i ctr, ek0, bits[6], gdatabuf[6], ghash = _mm_setzero_si128();
+ int ek0_encrypted = 0;
+ __m128i ac = _mm_set_epi32(0, (int)srclen * 8, 0, (int)aadlen * 8);
+
+ // src and dst are updated after the chunk is processed
+ const __m128i *src = _src;
+ __m128i *dst = _dst;
+ // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed indo the processor)
+ const __m128i *aad = _aad, *dst_ghash = dst;
+
+ /* build counter */
+ ctr = loadn(iv, PTLS_AESGCM_IV_SIZE);
+ ctr = _mm_shuffle_epi8(ctr, bswap8);
+ ctr = _mm_add_epi64(ctr, one);
+ ek0 = _mm_shuffle_epi8(ctr, bswap64);
+
+/* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */
+#define SETUP_BITS() \
+ do { \
+ for (int i = 0; i < 5; ++i) { \
+ ctr = _mm_add_epi64(ctr, one); \
+ bits[i] = _mm_shuffle_epi8(ctr, bswap64); \
+ } \
+ if (PTLS_LIKELY(srclen > 16 * 5)) { \
+ ctr = _mm_add_epi64(ctr, one); \
+ bits[5] = _mm_shuffle_epi8(ctr, bswap64); \
+ } else { \
+ assert(!ek0_encrypted); \
+ bits[5] = ek0; \
+ ek0_encrypted = 1; \
+ } \
+ } while (0)
+
+ /* build the first AES bits */
+ SETUP_BITS();
+ aesecb6(ctx, bits);
+
+ /* the main loop */
+ while (PTLS_LIKELY(srclen >= 6 * 16)) {
+ /* apply the bits */
+ for (int i = 0; i < 6; ++i)
+ _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i]));
+ srclen -= 6 * 16;
+
+ /* setup bits */
+ SETUP_BITS();
+
+ /* setup gdata */
+ const __m128i *gdata;
+ if (PTLS_UNLIKELY(aadlen != 0)) {
+ for (int i = 0; i < 6; ++i) {
+ if (aadlen < 16) {
+ if (aadlen != 0) {
+ gdatabuf[i++] = loadn(aad, aadlen);
+ aadlen = 0;
+ }
+ while (i < 6)
+ gdatabuf[i++] = *dst_ghash++;
+ break;
+ }
+ gdatabuf[i++] = _mm_loadu_si128(aad++);
+ aadlen -= 16;
+ }
+ gdata = gdatabuf;
+ } else {
+ gdata = dst_ghash;
+ dst_ghash += 6;
+ }
+
+ /* doit */
+ ghash = aesecb6ghash6(ctx, bits, gdata, ghash);
+ }
+
+ /* apply the bit stream to the remainder */
+ for (int i = 0; i < 6 && srclen != 0; ++i) {
+ if (srclen < 16) {
+ storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits[i]));
+ dst = (__m128i *)((uint8_t *)dst + srclen);
+ srclen = 0;
+ break;
+ }
+ _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i]));
+ srclen -= 16;
+ }
+
+ if (ek0_encrypted) {
+ ek0 = bits[5];
+ } else {
+ assert(!"FIXME calculate ek0");
+ }
+
+ finish_gcm(ctx, dst, dst_ghash, aad, aadlen, ghash, ac, ek0);
+}
+
static __m128i expand_key(__m128i key, __m128i t)
{
t = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 3, 3, 3));