/*
 * This source file is licensed under the Apache License 2.0 *and* the MIT
 * License. Please agree to *both* of the licensing terms!
 *
 *
 * `transformH` function is a derivative work of OpenSSL. The original work
 * is covered by the following license:
 *
 * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 *
 * All other work, including modifications to the `transformH` function is
 * covered by the following MIT license:
 *
 * Copyright (c) 2020 Fastly, Kazuho Oku
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <stdint.h>

#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <nmmintrin.h>
#include <wmmintrin.h>
#include "picotls.h"
#include "picotls/fusion.h"

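/* Each entry of the GHASH table below holds a precomputed power of the hash key: ghash[i].H is H^(i+1) in bit-reflected form,
 * and ghash[i].r caches the XOR of the upper and lower halves of that power, reused by gfmul_onestep as the "middle" operand of
 * the Karatsuba multiplication. */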
struct ptls_fusion_aesgcm_context {
    ptls_fusion_aesecb_context_t ecb;
    size_t capacity;
    size_t ghash_cnt;
    struct ptls_fusion_aesgcm_ghash_precompute {
        __m128i H;
        __m128i r;
    } ghash[0];
};

struct ctr_context {
    ptls_cipher_context_t super;
    ptls_fusion_aesecb_context_t fusion;
    __m128i bits;
    uint8_t is_ready;
};

struct aesgcm_context {
    ptls_aead_context_t super;
    ptls_fusion_aesgcm_context_t *aesgcm;
    /**
     * retains the static IV in the upper 96 bits (in little endian)
     */
    __m128i static_iv;
};

static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
#define poly (*(__m128i *)poly_)
static const uint8_t bswap8_[16] __attribute__((aligned(16))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#define bswap8 (*(__m128i *)bswap8_)
static const uint8_t one8_[16] __attribute__((aligned(16))) = {1};
#define one8 (*(__m128i *)one8_)

/* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl
 * at commit 33388b4. */
static __m128i transformH(__m128i H)
{
    // # <<1 twist
    // pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
    __m128i t2 = _mm_shuffle_epi32(H, 0xff);
    // movdqa $Hkey,$T1
    __m128i t1 = H;
    // psllq \$1,$Hkey
    H = _mm_slli_epi64(H, 1);
    // pxor $T3,$T3 #
    __m128i t3 = _mm_setzero_si128();
    // psrlq \$63,$T1
    t1 = _mm_srli_epi64(t1, 63);
    // pcmpgtd $T2,$T3 # broadcast carry bit
    t3 = _mm_cmplt_epi32(t2, t3);
    // pslldq \$8,$T1
    t1 = _mm_slli_si128(t1, 8);
    // por $T1,$Hkey # H<<=1
    H = _mm_or_si128(t1, H);

    // # magic reduction
    // pand .L0x1c2_polynomial(%rip),$T3
    t3 = _mm_and_si128(t3, poly);
    // pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
    H = _mm_xor_si128(t3, H);

    return H;
}
// end of Apache License code

static __m128i gfmul(__m128i x, __m128i y)
{
    __m128i lo = _mm_clmulepi64_si128(x, y, 0x00);
    __m128i hi = _mm_clmulepi64_si128(x, y, 0x11);

    __m128i a = _mm_shuffle_epi32(x, 78);
    __m128i b = _mm_shuffle_epi32(y, 78);
    a = _mm_xor_si128(a, x);
    b = _mm_xor_si128(b, y);

    a = _mm_clmulepi64_si128(a, b, 0x00);
    a = _mm_xor_si128(a, lo);
    a = _mm_xor_si128(a, hi);

    b = _mm_slli_si128(a, 8);
    a = _mm_srli_si128(a, 8);

    lo = _mm_xor_si128(lo, b);
    hi = _mm_xor_si128(hi, a);

    // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
    __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);
    t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);

    return _mm_xor_si128(hi, lo);
}

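/* State of a GHASH computation that is interleaved with AES: gfmul_onestep accumulates the three carry-less products of the
 * Karatsuba multiplication (hi, lo, mid) across blocks, each block using a different precomputed power of H, and gfmul_final
 * folds the accumulators together and performs the polynomial reduction once at the very end. */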
struct ptls_fusion_gfmul_state {
    __m128i hi, lo, mid;
};

static inline void gfmul_onestep(struct ptls_fusion_gfmul_state *gstate, __m128i X,
                                 struct ptls_fusion_aesgcm_ghash_precompute *precompute)
{
    X = _mm_shuffle_epi8(X, bswap8);
    __m128i t = _mm_clmulepi64_si128(precompute->H, X, 0x00);
    gstate->lo = _mm_xor_si128(gstate->lo, t);
    t = _mm_clmulepi64_si128(precompute->H, X, 0x11);
    gstate->hi = _mm_xor_si128(gstate->hi, t);
    t = _mm_shuffle_epi32(X, 78);
    t = _mm_xor_si128(t, X);
    t = _mm_clmulepi64_si128(precompute->r, t, 0x00);
    gstate->mid = _mm_xor_si128(gstate->mid, t);
}

static inline __m128i gfmul_final(struct ptls_fusion_gfmul_state *gstate, __m128i ek0)
{
    /* finish multiplication */
    gstate->mid = _mm_xor_si128(gstate->mid, gstate->hi);
    gstate->mid = _mm_xor_si128(gstate->mid, gstate->lo);
    gstate->lo = _mm_xor_si128(gstate->lo, _mm_slli_si128(gstate->mid, 8));
    gstate->hi = _mm_xor_si128(gstate->hi, _mm_srli_si128(gstate->mid, 8));

    /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */
    __m128i r = _mm_clmulepi64_si128(gstate->lo, poly, 0x10);
    gstate->lo = _mm_shuffle_epi32(gstate->lo, 78);
    gstate->lo = _mm_xor_si128(gstate->lo, r);
    r = _mm_clmulepi64_si128(gstate->lo, poly, 0x10);
    gstate->lo = _mm_shuffle_epi32(gstate->lo, 78);
    gstate->lo = _mm_xor_si128(gstate->lo, r);
    __m128i tag = _mm_xor_si128(gstate->hi, gstate->lo);
    tag = _mm_shuffle_epi8(tag, bswap8);
    tag = _mm_xor_si128(tag, ek0);

    return tag;
}

static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v)
{
    size_t i;

    v = _mm_xor_si128(v, ctx->keys[0]);
    for (i = 1; i < ctx->rounds; ++i)
        v = _mm_aesenc_si128(v, ctx->keys[i]);
    v = _mm_aesenclast_si128(v, ctx->keys[i]);

    return v;
}

static const uint8_t loadn_mask[31] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const uint8_t loadn_shuffle[31] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                          0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // latter 15 bytes map to zero

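/* loadn reads `l` (at most 16) bytes into an XMM register without faulting on the bytes that follow the buffer: when a plain
 * 16-byte load might cross into an unmapped page, the data is instead fetched with an aligned load that stays within the current
 * page and rotated into place with PSHUFB (loadn_shuffle); the lanes beyond `l` are then cleared with loadn_mask. The fast path
 * may over-read up to 15 bytes within the same page, which is why AddressSanitizer is disabled for this function. */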
#if defined(__clang__)
#if __has_feature(address_sanitizer)
__attribute__((no_sanitize("address")))
#endif
#elif __SANITIZE_ADDRESS__ /* gcc */
__attribute__((no_sanitize_address))
#endif
static inline __m128i loadn(const void *p, size_t l)
{
    __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 16 - l));
    uintptr_t mod4k = (uintptr_t)p % 4096;

    if (PTLS_LIKELY(mod4k <= 4080) || mod4k + l > 4096) {
        v = _mm_loadu_si128(p);
    } else {
        uintptr_t shift = (uintptr_t)p & 15;
        __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift));
        v = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern);
    }
    v = _mm_and_si128(v, mask);
    return v;
}

static inline void storen(void *_p, size_t l, __m128i v)
{
    uint8_t buf[16], *p = _p;

    *(__m128i *)buf = v;

    for (size_t i = 0; i != l; ++i)
        p[i] = buf[i];
}

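/* Encrypts `inlen` bytes of `input` into `output` and appends the 16-byte GCM tag; `output` therefore needs room for
 * `inlen + 16` bytes. Six AES counter blocks are processed per iteration of the main loop, with the GHASH of previously produced
 * data interleaved between the AES rounds so that the AES and carry-less-multiply units run in parallel. The caller is expected
 * to keep `aadlen + inlen` within the capacity the context was created with (see ptls_fusion_aesgcm_set_capacity). */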
void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                                const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, bswap8); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, bswap8); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, bswap8); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, bswap8); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, bswap8); \
        if (PTLS_LIKELY(srclen > 16 * 5)) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, bswap8); \
        } else { \
            if ((state & STATE_EK0_BEEN_FED) == 0) { \
                bits5 = ek0; \
                state |= STATE_EK0_BEEN_FED; \
            } \
            if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \
                bits4 = _mm_loadu_si128(supp->input); \
                bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys; \
                state |= STATE_SUPP_IN_PROCESS; \
            } \
        } \
        __m128i k = ctx->ecb.keys[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, bits4keys[0]); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->ecb.keys[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->ecb.keys[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)

    __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    const __m128i *bits4keys = ctx->ecb.keys; /* is changed to supp->ctx.keys when calculating suppout */
    struct ptls_fusion_gfmul_state gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8);

    // src and dst are updated after the chunk is processed
    const __m128i *src = input;
    __m128i *dst = output;
    size_t srclen = inlen;
    // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed into the processor)
    const __m128i *aad = _aad, *dst_ghash = dst;
    size_t dst_ghashlen = srclen;

    struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1;

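/* `state` tracks what still has to be produced besides the payload: EK0 (the encrypted initial counter block that is XORed into
 * the GHASH output to form the tag) and, optionally, the supplementary encryption requested via `supp`. Either one is computed by
 * borrowing an AES slot of the 6-block batch that would otherwise be idle near the end of the message. */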
#define STATE_EK0_BEEN_FED 0x3
#define STATE_EK0_INCOMPLETE 0x2
#define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1)
#define STATE_SUPP_USED 0x4
#define STATE_SUPP_IN_PROCESS 0x8
    int32_t state = supp != NULL ? STATE_SUPP_USED : 0;

    /* build counter */
    ctr = _mm_insert_epi32(ctr, 1, 0);
    ek0 = _mm_shuffle_epi8(ctr, bswap8);

    /* start preparing AES */
    AESECB6_INIT();
    AESECB6_UPDATE(1);

    /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */
    const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH
    size_t gdata_cnt = 0;
    if (PTLS_LIKELY(aadlen != 0)) {
        while (gdata_cnt < 6) {
            if (PTLS_LIKELY(aadlen < 16)) {
                if (aadlen != 0) {
                    gdatabuf[gdata_cnt++] = loadn(aad, aadlen);
                    aadlen = 0;
                }
                goto MainLoop;
            }
            gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
            aadlen -= 16;
        }
    }

    /* the main loop */
MainLoop:
    while (1) {
        /* run AES and multiplication in parallel */
        size_t i;
        for (i = 2; i < gdata_cnt + 2; ++i) {
            AESECB6_UPDATE(i);
            gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
        }
        for (; i < ctx->ecb.rounds; ++i)
            AESECB6_UPDATE(i);
        AESECB6_FINAL(i);

        /* apply the bit stream to src and write to dest */
        if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src += 6;
            srclen -= 6 * 16;
        } else {
            if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) {
                ek0 = bits5;
                state &= ~STATE_EK0_INCOMPLETE;
            }
            if ((state & STATE_SUPP_IN_PROCESS) != 0) {
                _mm_storeu_si128((__m128i *)supp->output, bits4);
                state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS);
            }
            if (srclen != 0) {
#define APPLY(i) \
    do { \
        if (PTLS_LIKELY(srclen >= 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \
            srclen -= 16; \
        } else if (PTLS_LIKELY(srclen != 0)) { \
            bits0 = bits##i; \
            goto ApplyRemainder; \
        } else { \
            goto ApplyEnd; \
        } \
    } while (0)
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                goto ApplyEnd;
            ApplyRemainder:
                storen(dst, srclen, _mm_xor_si128(loadn(src, srclen), bits0));
                dst = (__m128i *)((uint8_t *)dst + srclen);
                srclen = 0;
            ApplyEnd:;
            }
        }

        /* next block AES starts here */
        AESECB6_INIT();

        AESECB6_UPDATE(1);

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn(aad, aadlen);
                        aadlen = 0;
                    }
                    goto GdataFillDST;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
            }
            gdata = gdatabuf;
        } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) {
            gdata = dst_ghash;
            gdata_cnt = 6;
            dst_ghash += 6;
            dst_ghashlen -= 96;
        } else {
            gdata_cnt = 0;
        GdataFillDST:
            while (gdata_cnt < 6) {
                if (dst_ghashlen < 16) {
                    if (dst_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn(dst_ghash, dst_ghashlen);
                        dst_ghashlen = 0;
                    }
                    if (gdata_cnt < 6)
                        goto Finish;
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++);
                dst_ghashlen -= 16;
            }
            gdata = gdatabuf;
        }
    }

Finish:
    gdatabuf[gdata_cnt++] = ac;

    /* We have the complete set of data to be fed into GHASH. Let's finish the remaining calculation.
     * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM
     * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead.
     */
    assert(STATE_EK0_READY());
    for (size_t i = 0; i < gdata_cnt; ++i)
        gfmul_onestep(&gstate, gdatabuf[i], --ghash_precompute);

    _mm_storeu_si128(dst, gfmul_final(&gstate, ek0));

    /* Finish the calculation of the supplemental vector. Done at the very last, because the sample might cover the GCM tag. */
    if ((state & STATE_SUPP_USED) != 0) {
        size_t i;
        if ((state & STATE_SUPP_IN_PROCESS) == 0) {
            bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys;
            bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]);
            i = 1;
        } else {
            i = 2;
        }
        do {
            bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]);
        } while (i != ctx->ecb.rounds);
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]);
        _mm_storeu_si128((__m128i *)supp->output, bits4);
    }

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_BEEN_FED
#undef STATE_EK0_READY
#undef STATE_SUPP_IN_PROCESS
}

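/* Decrypts `inlen` bytes of ciphertext (the tag is not included in `inlen`) from `input` into `output`, feeding the ciphertext
 * to GHASH while running AES-CTR in the same 6-block batches as the encrypt path, and finally compares the computed tag against
 * `tag`. Returns non-zero if the tag matches, zero otherwise. */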
int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                               const void *_aad, size_t aadlen, const void *tag)
{
    __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(),
            bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), bswap8);
    struct ptls_fusion_aesgcm_ghash_precompute *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1;

    const __m128i *gdata; // points to the elements fed into GHASH
    size_t gdata_cnt;

    const __m128i *src_ghash = input, *src_aes = input, *aad = _aad;
    __m128i *dst = output;
    size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen;

    /* schedule ek0 and suppkey */
    ctr = _mm_add_epi64(ctr, one8);
    bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap8), ctx->ecb.keys[0]);
    ++nondata_aes_cnt;

#define STATE_IS_FIRST_RUN 0x1
#define STATE_GHASH_HAS_MORE 0x2
    int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE;

    /* the main loop */
    while (1) {

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata = gdatabuf;
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn(aad, aadlen);
                        aadlen = 0;
                        ++nondata_aes_cnt;
                    }
                    goto GdataFillSrc;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
                ++nondata_aes_cnt;
            }
        } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) {
            gdata = src_ghash;
            gdata_cnt = 6;
            src_ghash += 6;
            src_ghashlen -= 6 * 16;
        } else {
            gdata = gdatabuf;
            gdata_cnt = 0;
        GdataFillSrc:
            while (gdata_cnt < 6) {
                if (src_ghashlen < 16) {
                    if (src_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn(src_ghash, src_ghashlen);
                        src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen);
                        src_ghashlen = 0;
                    }
                    if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) {
                        gdatabuf[gdata_cnt++] = ac;
                        state &= ~STATE_GHASH_HAS_MORE;
                    }
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++);
                src_ghashlen -= 16;
            }
        }

        /* setup aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0))
            goto InitAllBits;
        switch (nondata_aes_cnt) {
#define INIT_BITS(n, keys) \
    case n: \
        ctr = _mm_add_epi64(ctr, one8); \
        bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, bswap8), keys[0]);
        InitAllBits:
            INIT_BITS(0, ctx->ecb.keys);
            INIT_BITS(1, ctx->ecb.keys);
            INIT_BITS(2, ctx->ecb.keys);
            INIT_BITS(3, ctx->ecb.keys);
            INIT_BITS(4, ctx->ecb.keys);
            INIT_BITS(5, ctx->ecb.keys);
#undef INIT_BITS
        }

        { /* run aes and ghash */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->ecb.keys[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

            size_t aesi;
            for (aesi = 1; aesi <= gdata_cnt; ++aesi) {
                AESECB6_UPDATE(aesi);
                gfmul_onestep(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
            }
            for (; aesi < ctx->ecb.rounds; ++aesi)
                AESECB6_UPDATE(aesi);
            __m128i k = ctx->ecb.keys[aesi];
            bits0 = _mm_aesenclast_si128(bits0, k);
            bits1 = _mm_aesenclast_si128(bits1, k);
            bits2 = _mm_aesenclast_si128(bits2, k);
            bits3 = _mm_aesenclast_si128(bits3, k);
            bits4 = _mm_aesenclast_si128(bits4, k);
            bits5 = _mm_aesenclast_si128(bits5, k);

#undef AESECB6_UPDATE
        }

        /* apply aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src_aes += 6;
            src_aeslen -= 6 * 16;
        } else {
            if ((state & STATE_IS_FIRST_RUN) != 0) {
                ek0 = bits0;
                state &= ~STATE_IS_FIRST_RUN;
            }
            switch (nondata_aes_cnt) {
#define APPLY(i) \
    case i: \
        if (PTLS_LIKELY(src_aeslen > 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \
            src_aeslen -= 16; \
        } else { \
            bits0 = bits##i; \
            goto Finish; \
        }
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
            }
            nondata_aes_cnt = 0;
        }
    }

Finish:
    if (src_aeslen == 16) {
        _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0));
    } else if (src_aeslen != 0) {
        storen(dst, src_aeslen, _mm_xor_si128(loadn(src_aes, src_aeslen), bits0));
    }

    assert((state & STATE_IS_FIRST_RUN) == 0);

    /* the only case where the AES operation is complete but GHASH is not is when the application of AC remains */
    if ((state & STATE_GHASH_HAS_MORE) != 0) {
        assert(ghash_precompute - 1 == ctx->ghash);
        gfmul_onestep(&gstate, ac, --ghash_precompute);
    }

    __m128i calctag = gfmul_final(&gstate, ek0);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff;

#undef STATE_IS_FIRST_RUN
#undef STATE_GHASH_HAS_MORE
}

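/* AES key schedule helper: derives the next round key from the previous one(s); the rotated and substituted word produced by
 * AESKEYGENASSIST is supplied as `temp` by the EXPAND macro in ptls_fusion_aesecb_init below. */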
static __m128i expand_key(__m128i key, __m128i temp)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    key = _mm_xor_si128(key, temp);

    return key;
}

void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size)
{
    assert(is_enc && "decryption is not supported (yet)");

    size_t i = 0;

    switch (key_size) {
    case 16: /* AES128 */
        ctx->rounds = 10;
        break;
    case 32: /* AES256 */
        ctx->rounds = 14;
        break;
    default:
        assert(!"invalid key size; AES128 / AES256 are supported");
        break;
    }

    ctx->keys[i++] = _mm_loadu_si128((__m128i *)key);
    if (key_size == 32)
        ctx->keys[i++] = _mm_loadu_si128((__m128i *)key + 1);

#define EXPAND(R) \
    do { \
        ctx->keys[i] = expand_key(ctx->keys[i - key_size / 16], \
                                  _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \
        if (i == ctx->rounds) \
            goto Done; \
        ++i; \
        if (key_size > 24) { \
            ctx->keys[i] = expand_key(ctx->keys[i - key_size / 16], \
                                      _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \
            ++i; \
        } \
    } while (0)
    EXPAND(0x1);
    EXPAND(0x2);
    EXPAND(0x4);
    EXPAND(0x8);
    EXPAND(0x10);
    EXPAND(0x20);
    EXPAND(0x40);
    EXPAND(0x80);
    EXPAND(0x1b);
    EXPAND(0x36);
#undef EXPAND
Done:
    assert(i == ctx->rounds);
}

void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx)
{
    ptls_clear_memory(ctx, sizeof(*ctx));
}

void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src)
{
    __m128i v = _mm_loadu_si128(src);
    v = aesecb_encrypt(ctx, v);
    _mm_storeu_si128(dst, v);
}

/**
 * returns the number of ghash entries that is required to handle an AEAD block of given size
 */
static size_t aesgcm_calc_ghash_cnt(size_t capacity)
{
    // round up to the block size, add one to handle the worst split of the size between AAD and payload, plus one to hash AC
    return (capacity + 15) / 16 + 2;
}

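/* Appends one entry to the precomputed GHASH table: entry i receives H^(i+1) (entry 0, holding H itself, is initialized by the
 * caller) along with the folded halves that gfmul_onestep uses as the Karatsuba middle operand. */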
static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx)
{
    if (ctx->ghash_cnt != 0)
        ctx->ghash[ctx->ghash_cnt].H = gfmul(ctx->ghash[ctx->ghash_cnt - 1].H, ctx->ghash[0].H);

    __m128i r = _mm_shuffle_epi32(ctx->ghash[ctx->ghash_cnt].H, 78);
    r = _mm_xor_si128(r, ctx->ghash[ctx->ghash_cnt].H);
    ctx->ghash[ctx->ghash_cnt].r = r;

    ++ctx->ghash_cnt;
}

ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity)
{
    ptls_fusion_aesgcm_context_t *ctx;
    size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity);

    if ((ctx = malloc(sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL)
        return NULL;

    ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size);

    ctx->capacity = capacity;

    ctx->ghash[0].H = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128());
    ctx->ghash[0].H = _mm_shuffle_epi8(ctx->ghash[0].H, bswap8);
    ctx->ghash[0].H = transformH(ctx->ghash[0].H);
    ctx->ghash_cnt = 0;
    while (ctx->ghash_cnt < ghash_cnt)
        setup_one_ghash_entry(ctx);

    return ctx;
}

ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity)
{
    size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity);

    if (ghash_cnt <= ctx->ghash_cnt)
        return ctx;

    if ((ctx = realloc(ctx, sizeof(*ctx) + sizeof(ctx->ghash[0]) * ghash_cnt)) == NULL)
        return NULL;

    ctx->capacity = capacity;
    while (ctx->ghash_cnt < ghash_cnt)
        setup_one_ghash_entry(ctx);

    return ctx;
}

void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx)
{
    ptls_clear_memory(ctx->ghash, sizeof(ctx->ghash[0]) * ctx->ghash_cnt);
    ctx->ghash_cnt = 0;
    ptls_fusion_aesecb_dispose(&ctx->ecb);
    free(ctx);
}

static void ctr_dispose(ptls_cipher_context_t *_ctx)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;
    ptls_fusion_aesecb_dispose(&ctx->fusion);
    _mm_storeu_si128(&ctx->bits, _mm_setzero_si128());
}

static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;
    _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv)));
    ctx->is_ready = 1;
}

static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;

    assert((ctx->is_ready && len <= 16) ||
           !"CTR transformation is supported only once per call to `init` and the maximum size is limited to 16 bytes");
    ctx->is_ready = 0;

    if (len < 16) {
        storen(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn(input, len)));
    } else {
        _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input)));
    }
}

static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;

    ctx->super.do_dispose = ctr_dispose;
    ctx->super.do_init = ctr_init;
    ctx->super.do_transform = ctr_transform;
    ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size);
    ctx->is_ready = 0;

    return 0;
}

static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
{
    return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE);
}

static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
{
    return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE);
}

static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ptls_fusion_aesgcm_free(ctx->aesgcm);
}

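/* Builds the initial counter block from the static IV and the 64-bit sequence number, following the TLS 1.3 / QUIC nonce
 * construction (IV XOR left-padded sequence number). `static_iv` keeps the IV byte-reversed in its upper 96 bits, so the
 * little-endian sequence number is shifted up by 4 bytes to line up with the low-order 64 bits of the IV before the XOR. */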
static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq)
{
    __m128i ctr = _mm_setzero_si128();
    ctr = _mm_insert_epi64(ctr, seq, 0);
    ctr = _mm_slli_si128(ctr, 4);
    ctr = _mm_xor_si128(ctx->static_iv, ctr);
    return ctr;
}

void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
                     ptls_iovec_t aad, ptls_aead_supplementary_encryption_t *supp)
{
    struct aesgcm_context *ctx = (void *)_ctx;

    if (inlen + aad.len > ctx->aesgcm->capacity)
        ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aad.len);
    ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad.base, aad.len, supp);
}

static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq,
                              ptls_iovec_t aad)
{
    assert(!"FIXME");
}

static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
                              ptls_iovec_t aad)
{
    struct aesgcm_context *ctx = (void *)_ctx;

    if (inlen < 16)
        return SIZE_MAX;

    size_t enclen = inlen - 16;
    if (enclen + aad.len > ctx->aesgcm->capacity)
        ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aad.len);
    if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad.base, aad.len,
                                    (const uint8_t *)input + enclen))
        return SIZE_MAX;
    return enclen;
}

static inline void aesgcm_xor_iv(ptls_aead_context_t *_ctx, const void *_bytes, size_t len)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
    __m128i xor_mask = loadn(_bytes, len);
    xor_mask = _mm_shuffle_epi8(xor_mask, bswap8);
    ctx->static_iv = _mm_xor_si128(ctx->static_iv, xor_mask);
}

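/* Common setup for the fusion AES-GCM AEAD. When `key` is NULL, only the static IV is (re)initialized and the rest of the
 * context is left untouched; otherwise the callbacks are installed and a ptls_fusion_aesgcm context sized for a typical
 * 1500-byte packet is created (it is grown on demand by the encrypt / decrypt callbacks above). */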
static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ctx->static_iv = loadn(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, bswap8);
    if (key == NULL)
        return 0;

    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_xor_iv = aesgcm_xor_iv;
    ctx->super.do_encrypt = aead_do_encrypt;
    ctx->super.do_encrypt_v = aead_do_encrypt_v;
    ctx->super.do_decrypt = aead_do_decrypt;

    ctx->aesgcm = ptls_fusion_aesgcm_new(key, key_size, 1500 /* assume ordinary packet size */);

    return 0;
}

static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}

static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}

ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR",
                                                 PTLS_AES128_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes128ctr_setup};
ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR",
                                                 PTLS_AES256_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes256ctr_setup};
ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes128ctr,
                                               NULL, // &ptls_fusion_aes128ecb,
                                               PTLS_AES128_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               sizeof(struct aesgcm_context),
                                               aes128gcm_setup};
ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes256ctr,
                                               NULL, // &ptls_fusion_aes256ecb,
                                               PTLS_AES256_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               sizeof(struct aesgcm_context),
                                               aes256gcm_setup};

#ifdef _WINDOWS
/**
 * ptls_fusion_is_supported_by_cpu:
 * Checks that the CPU has the extended instructions required by this engine (PCLMULQDQ, AES-NI, AVX2).
 * This test assumes that the CPU follows the x86/x64 architecture.
 * A slightly more refined test could check that the cpu_info spells out
 * "genuineIntel" or "authenticAMD", but that would fail in the presence of
 * little-known CPU brands or some VMs. */
int ptls_fusion_is_supported_by_cpu(void)
{
    uint32_t cpu_info[4];
    uint32_t nb_ids;
    int is_supported = 0;

    __cpuid(cpu_info, 0);
    nb_ids = cpu_info[0];

    if (nb_ids >= 7) {
        uint32_t leaf1_ecx;
        __cpuid(cpu_info, 1);
        leaf1_ecx = cpu_info[2];

        if (/* PCLMUL */ (leaf1_ecx & (1 << 1)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) {
            uint32_t leaf7_ebx;
            __cpuid(cpu_info, 7);
            leaf7_ebx = cpu_info[1];

            is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0;
        }
    }

    return is_supported;
}
#else
int ptls_fusion_is_supported_by_cpu(void)
{
    unsigned leaf1_ecx, leaf7_ebx;

    { /* GCC-specific code to obtain CPU features */
        unsigned leaf_cnt;
        __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx");
        if (leaf_cnt < 7)
            return 0;
        __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx");
        __asm__("cpuid" : "=b"(leaf7_ebx) : "a"(7), "c"(0) : "edx");
    }

    /* AVX2 */
    if ((leaf7_ebx & (1 << 5)) == 0)
        return 0;
    /* AES */
    if ((leaf1_ecx & (1 << 25)) == 0)
        return 0;
    /* PCLMUL */
    if ((leaf1_ecx & (1 << 1)) == 0)
        return 0;

    return 1;
}
#endif