/*
 * This source file is licensed under the Apache License 2.0 *and* the MIT
 * License. Please agree to *both* of the licensing terms!
 *
 *
 * The `transformH` function is a derivative work of OpenSSL. The original work
 * is covered by the following license:
 *
 * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 *
 * All other work, including modifications to the `transformH` function, is
 * covered by the following MIT license:
 *
 * Copyright (c) 2020-2022 Fastly, Kazuho Oku
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <stdint.h>

#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <nmmintrin.h>
#include <wmmintrin.h>
#include "picotls.h"
#include "picotls/fusion.h"

#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#endif
#elif __SANITIZE_ADDRESS__ /* gcc */
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#endif
#ifndef NO_SANITIZE_ADDRESS
#define NO_SANITIZE_ADDRESS
#endif

#ifdef _WINDOWS
#define aligned_alloc(a, s) _aligned_malloc((s), (a))
#endif

struct ptls_fusion_aesgcm_context {
    ptls_fusion_aesecb_context_t ecb;
    size_t capacity;
    size_t ghash_cnt;
};

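/* The two concrete layouts below append the table of precomputed GHASH values to this common header as a flexible array
 * member (`ghash[0]`); the number of initialized entries is tracked in `super.ghash_cnt`. */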
struct ptls_fusion_aesgcm_context128 {
    struct ptls_fusion_aesgcm_context super;
    struct ptls_fusion_aesgcm_ghash_precompute128 {
        __m128i H;
        __m128i r;
    } ghash[0];
};

struct ptls_fusion_aesgcm_context256 {
    struct ptls_fusion_aesgcm_context super;
    union ptls_fusion_aesgcm_ghash_precompute256 {
        struct {
            __m128i H[2];
            __m128i r[2];
        };
        struct {
            __m256i Hx2;
            __m256i rx2;
        };
    } ghash[0];
};

struct ctr_context {
    ptls_cipher_context_t super;
    ptls_fusion_aesecb_context_t fusion;
    __m128i bits;
    uint8_t is_ready;
};

struct aesgcm_context {
    ptls_aead_context_t super;
    ptls_fusion_aesgcm_context_t *aesgcm;
    /**
     * retains the static IV in the upper 96 bits (in little endian)
     */
    __m128i static_iv;
};

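/* `poly` below is the GHASH reduction polynomial (x^128 + x^7 + x^2 + x + 1) in the bit-reflected representation; it is the
 * `0x1c2_polynomial` constant referred to by the OpenSSL-derived comments in transformH. */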
static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
#define poly (*(__m128i *)poly_)
static const uint8_t byteswap_[32] __attribute__((aligned(32))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                                                   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#define byteswap128 (*(__m128i *)byteswap_)
#define byteswap256 (*(__m256i *)byteswap_)
static const uint8_t one_[16] __attribute__((aligned(16))) = {1};
#define one8 (*(__m128i *)one_)
static const uint8_t incr128x2_[32] __attribute__((aligned(32))) = {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2};
#define incr128x2 (*(__m256i *)incr128x2_)

/* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl
 * at commit 33388b4. */
static __m128i transformH(__m128i H)
{
    //  # <<1 twist
    //  pshufd          \$0b11111111,$Hkey,$T2  # broadcast uppermost dword
    __m128i t2 = _mm_shuffle_epi32(H, 0xff);
    // movdqa          $Hkey,$T1
    __m128i t1 = H;
    // psllq           \$1,$Hkey
    H = _mm_slli_epi64(H, 1);
    // pxor            $T3,$T3                 #
    __m128i t3 = _mm_setzero_si128();
    // psrlq           \$63,$T1
    t1 = _mm_srli_epi64(t1, 63);
    // pcmpgtd         $T2,$T3                 # broadcast carry bit
    t3 = _mm_cmplt_epi32(t2, t3);
    // pslldq          \$8,$T1
    t1 = _mm_slli_si128(t1, 8);
    // por             $T1,$Hkey               # H<<=1
    H = _mm_or_si128(t1, H);

    // # magic reduction
    // pand            .L0x1c2_polynomial(%rip),$T3
    t3 = _mm_and_si128(t3, poly);
    // pxor            $T3,$Hkey               # if(carry) H^=0x1c2_polynomial
    H = _mm_xor_si128(t3, H);

    return H;
}
// end of Apache License code

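/* As the inlined OpenSSL comments indicate, `transformH` applies the "<<1 twist" to the byte-swapped hash key
 * H = AES_K(0^128): the key is doubled, and if a carry pops out of the top the reduction polynomial is XORed back in. All
 * precomputed powers of H stored in the context are kept in this representation. */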
static __m128i gfmul(__m128i x, __m128i y)
{
    __m128i lo = _mm_clmulepi64_si128(x, y, 0x00);
    __m128i hi = _mm_clmulepi64_si128(x, y, 0x11);

    __m128i a = _mm_shuffle_epi32(x, 78);
    __m128i b = _mm_shuffle_epi32(y, 78);
    a = _mm_xor_si128(a, x);
    b = _mm_xor_si128(b, y);

    a = _mm_clmulepi64_si128(a, b, 0x00);
    a = _mm_xor_si128(a, lo);
    a = _mm_xor_si128(a, hi);

    b = _mm_slli_si128(a, 8);
    a = _mm_srli_si128(a, 8);

    lo = _mm_xor_si128(lo, b);
    hi = _mm_xor_si128(hi, a);

    // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
    __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);
    t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);

    return _mm_xor_si128(hi, lo);
}

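/* `gfmul` above is the standalone one-shot Karatsuba multiply; it is only used at setup time, when precomputing the powers of
 * H (see setup_one_ghash_entry). The hot paths instead accumulate partial products in the gfmul_state structures defined
 * below, deferring the shared reduction step to gfmul_do_reduce. */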
static inline __m128i gfmul_do_reduce(__m128i hi, __m128i lo, __m128i mid)
{
    mid = _mm_xor_si128(mid, hi);
    mid = _mm_xor_si128(mid, lo);
    lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
    hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

    /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */
    __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, r);
    r = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, r);
    lo = _mm_xor_si128(hi, lo);

    return lo;
}

struct ptls_fusion_gfmul_state128 {
    __m128i hi, lo, mid;
};

static inline void gfmul_do_step128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                    struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    __m128i t = _mm_clmulepi64_si128(precompute->H, X, 0x00);
    gstate->lo = _mm_xor_si128(gstate->lo, t);
    t = _mm_clmulepi64_si128(precompute->H, X, 0x11);
    gstate->hi = _mm_xor_si128(gstate->hi, t);
    t = _mm_shuffle_epi32(X, 78);
    t = _mm_xor_si128(t, X);
    t = _mm_clmulepi64_si128(precompute->r, t, 0x00);
    gstate->mid = _mm_xor_si128(gstate->mid, t);
}

static inline void gfmul_firststep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                      struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    X = _mm_shuffle_epi8(X, byteswap128);
    X = _mm_xor_si128(gstate->lo, X);
    gstate->lo = _mm_setzero_si128();
    gstate->hi = _mm_setzero_si128();
    gstate->mid = _mm_setzero_si128();
    gfmul_do_step128(gstate, X, precompute);
}

static inline void gfmul_nextstep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                     struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    X = _mm_shuffle_epi8(X, byteswap128);
    gfmul_do_step128(gstate, X, precompute);
}

static inline void gfmul_reduce128(struct ptls_fusion_gfmul_state128 *gstate)
{
    gstate->lo = gfmul_do_reduce(gstate->hi, gstate->lo, gstate->mid);
}

static inline __m128i gfmul_get_tag128(struct ptls_fusion_gfmul_state128 *gstate, __m128i ek0)
{
    __m128i tag = _mm_shuffle_epi8(gstate->lo, byteswap128);
    tag = _mm_xor_si128(tag, ek0);
    return tag;
}

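/* Illustrative sketch of how the gfmul_*128 helpers compose (reduce_aad128 near the end of this file follows the same
 * pattern): `firststep` folds the running GHASH value into the first block and clears the accumulators, each `nextstep`
 * folds one more block against the next lower power of H (the precompute table is walked backwards), and `reduce` collapses
 * the accumulators into `gstate.lo`. Assuming `n` does not exceed the number of precomputed entries:
 *
 *     struct ptls_fusion_gfmul_state128 gstate = {0};
 *     gfmul_firststep128(&gstate, blocks[0], precompute + n - 1);
 *     for (size_t i = 1; i < n; ++i)
 *         gfmul_nextstep128(&gstate, blocks[i], precompute + n - 1 - i);
 *     gfmul_reduce128(&gstate); // GHASH of the n blocks is now in gstate.lo
 */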
struct ptls_fusion_gfmul_state256 {
    __m256i hi, lo, mid;
};

static inline void gfmul_do_step256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
                                    union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    __m256i t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x00);
    gstate->lo = _mm256_xor_si256(gstate->lo, t);
    t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x11);
    gstate->hi = _mm256_xor_si256(gstate->hi, t);
    t = _mm256_shuffle_epi32(X, 78);
    t = _mm256_xor_si256(t, X);
    t = _mm256_clmulepi64_epi128(precompute->rx2, t, 0x00);
    gstate->mid = _mm256_xor_si256(gstate->mid, t);
}

static inline void gfmul_firststep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, int half,
                                      union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    X = _mm256_shuffle_epi8(X, byteswap256);
    X = _mm256_xor_si256(gstate->lo, X);
    if (half)
        X = _mm256_permute2f128_si256(X, X, 0x08);
    gstate->lo = _mm256_setzero_si256();
    gstate->hi = _mm256_setzero_si256();
    gstate->mid = _mm256_setzero_si256();
    gfmul_do_step256(gstate, X, precompute);
}

static inline void gfmul_nextstep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
                                     union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    X = _mm256_shuffle_epi8(X, byteswap256);
    gfmul_do_step256(gstate, X, precompute);
}

static inline void gfmul_reduce256(struct ptls_fusion_gfmul_state256 *gstate)
{
#define XOR_256TO128(y) _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extractf128_si256((y), 1))
    __m128i hi = XOR_256TO128(gstate->hi);
    __m128i lo = XOR_256TO128(gstate->lo);
    __m128i mid = XOR_256TO128(gstate->mid);
#undef XOR_256TO128

    lo = gfmul_do_reduce(hi, lo, mid);
    gstate->lo = _mm256_castsi128_si256(lo);
}

static inline __m128i gfmul_get_tag256(struct ptls_fusion_gfmul_state256 *gstate, __m128i ek0)
{
    __m128i tag = _mm_shuffle_epi8(_mm256_castsi256_si128(gstate->lo), byteswap128);
    tag = _mm_xor_si128(tag, ek0);
    return tag;
}

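/* Encrypts one block with AES-ECB, running every round in sequence; used for one-shot derivations such as the GHASH key
 * H = AES_K(0^128) in new_aesgcm and the keystream block of the CTR cipher in ctr_init. */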
static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v)
{
#define ROUNDKEY(i) (ctx->avx256 ? _mm256_castsi256_si128(ctx->keys.m256[i]) : ctx->keys.m128[i])

    v = _mm_xor_si128(v, ROUNDKEY(0));
    for (size_t i = 1; i < ctx->rounds; ++i)
        v = _mm_aesenc_si128(v, ROUNDKEY(i));
    v = _mm_aesenclast_si128(v, ROUNDKEY(ctx->rounds));

    return v;

#undef ROUNDKEY
}

// 32 bytes of 0xff followed by 31 bytes of 0x00
static const uint8_t loadn_mask[63] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const uint8_t loadn_shuffle[31] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                          0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // latter 15 bytes map to zero

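/* Loads the up-to-15 bytes at `p` when they sit at the very end of a page: the aligned 16 bytes containing `p` are read
 * (which cannot cross the page boundary), then pshufb shifts the bytes starting at `p` into the low lanes; the 0x80 entries
 * of loadn_shuffle force the surplus lanes to zero. */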
NO_SANITIZE_ADDRESS
static inline __m128i loadn_end_of_page(const void *p, size_t l)
{
    uintptr_t shift = (uintptr_t)p & 15;
    __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift));
    return _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern);
}

NO_SANITIZE_ADDRESS
static inline __m128i loadn128(const void *p, size_t l)
{
    __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 32 - l));
    uintptr_t mod4k = (uintptr_t)p % 4096;

    if (PTLS_LIKELY(mod4k <= 4096 - 16) || mod4k + l > 4096) {
        v = _mm_loadu_si128(p);
    } else {
        v = loadn_end_of_page(p, l);
    }
    v = _mm_and_si128(v, mask);

    return v;
}

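/* Example of the page guard in loadn128 above (loadn256 below applies the same logic with a 32-byte window): with
 * p % 4096 == 4085 and l == 5, a plain 16-byte load would touch offsets 4085..4100, crossing into a page the caller might
 * not own, so the loadn_end_of_page path is taken; with l == 15 we have mod4k + l > 4096, i.e., the caller's buffer itself
 * extends into the next page, and the unaligned load is known to be safe. */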
NO_SANITIZE_ADDRESS
static inline __m256i loadn256(const void *p, size_t l)
{
    __m256i v, mask = _mm256_loadu_si256((__m256i *)(loadn_mask + 32 - l));
    uintptr_t mod4k = (uintptr_t)p % 4096;

    if (PTLS_LIKELY(mod4k < 4096 - 32) || mod4k + l > 4096) {
        v = _mm256_loadu_si256(p);
    } else if (l > 16) {
        __m128i first16 = _mm_loadu_si128(p), second16 = loadn128((uint8_t *)p + 16, l - 16);
        v = _mm256_permute2f128_si256(_mm256_castsi128_si256(first16), _mm256_castsi128_si256(second16), 0x20);
    } else if (l == 16) {
        v = _mm256_castsi128_si256(_mm_loadu_si128(p));
    } else {
        v = _mm256_castsi128_si256(loadn_end_of_page(p, l));
    }
    v = _mm256_and_si256(v, mask);

    return v;
}

static inline void storen128(void *_p, size_t l, __m128i v)
{
    uint8_t buf[16], *p = _p;

    *(__m128i *)buf = v;

    for (size_t i = 0; i != l; ++i)
        p[i] = buf[i];
}

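/* Encrypts one AEAD record and appends the tag. The kernel interleaves the two halves of AES-GCM (hence the name "fusion"):
 * six counter blocks travel through the AES rounds while the six ciphertext (or AAD) blocks produced earlier are folded into
 * the GHASH state, consuming one precompute-table entry per block. When the payload is short enough, the bits5 slot doubles
 * as the carrier for EK0 (the tag mask) and bits4 for the supplementary (e.g., QUIC header protection) vector. */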
void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                                const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
        if (PTLS_LIKELY(srclen > 16 * 5)) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
        } else { \
            if ((state & STATE_EK0_BEEN_FED) == 0) { \
                bits5 = ek0; \
                state |= STATE_EK0_BEEN_FED; \
            } \
            if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \
                bits4 = _mm_loadu_si128(supp->input); \
                bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; \
                state |= STATE_SUPP_IN_PROCESS; \
            } \
        } \
        __m128i k = ctx->super.ecb.keys.m128[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, bits4keys[0]); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)

    struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
    __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    const __m128i *bits4keys = ctx->super.ecb.keys.m128; /* is changed to supp->ctx.keys when calculating suppout */
    struct ptls_fusion_gfmul_state128 gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);

    // src and dst are updated after the chunk is processed
    const __m128i *src = input;
    __m128i *dst = output;
    size_t srclen = inlen;
    // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed into the processor)
    const __m128i *aad = _aad, *dst_ghash = dst;
    size_t dst_ghashlen = srclen;

    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1;

#define STATE_EK0_BEEN_FED 0x3
#define STATE_EK0_INCOMPLETE 0x2
#define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1)
#define STATE_SUPP_USED 0x4
#define STATE_SUPP_IN_PROCESS 0x8
    int32_t state = supp != NULL ? STATE_SUPP_USED : 0;

    /* build counter */
    ctr = _mm_insert_epi32(ctr, 1, 0);
    ek0 = _mm_shuffle_epi8(ctr, byteswap128);

    /* start preparing AES */
    AESECB6_INIT();
    AESECB6_UPDATE(1);

    /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */
    const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH
    size_t gdata_cnt = 0;
    if (PTLS_LIKELY(aadlen != 0)) {
        while (gdata_cnt < 6) {
            if (PTLS_LIKELY(aadlen < 16)) {
                if (aadlen != 0) {
                    gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                    aadlen = 0;
                }
                goto MainLoop;
            }
            gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
            aadlen -= 16;
        }
    }

    /* the main loop */
MainLoop:
    while (1) {
        /* run AES and multiplication in parallel */
        size_t i;
        for (i = 2; i < gdata_cnt + 2; ++i) {
            AESECB6_UPDATE(i);
            gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
        }
        for (; i < ctx->super.ecb.rounds; ++i)
            AESECB6_UPDATE(i);
        AESECB6_FINAL(i);

        /* apply the bit stream to src and write to dest */
        if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src += 6;
            srclen -= 6 * 16;
        } else {
            if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) {
                ek0 = bits5;
                state &= ~STATE_EK0_INCOMPLETE;
            }
            if ((state & STATE_SUPP_IN_PROCESS) != 0) {
                _mm_storeu_si128((__m128i *)supp->output, bits4);
                state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS);
            }
            if (srclen != 0) {
#define APPLY(i) \
    do { \
        if (PTLS_LIKELY(srclen >= 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \
            srclen -= 16; \
        } else if (PTLS_LIKELY(srclen != 0)) { \
            bits0 = bits##i; \
            goto ApplyRemainder; \
        } else { \
            goto ApplyEnd; \
        } \
    } while (0)
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                goto ApplyEnd;
            ApplyRemainder:
                storen128(dst, srclen, _mm_xor_si128(loadn128(src, srclen), bits0));
                dst = (__m128i *)((uint8_t *)dst + srclen);
                srclen = 0;
            ApplyEnd:;
            }
        }

        /* next block AES starts here */
        AESECB6_INIT();

        AESECB6_UPDATE(1);

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                        aadlen = 0;
                    }
                    goto GdataFillDST;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
            }
            gdata = gdatabuf;
        } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) {
            gdata = dst_ghash;
            gdata_cnt = 6;
            dst_ghash += 6;
            dst_ghashlen -= 96;
        } else {
            gdata_cnt = 0;
        GdataFillDST:
            while (gdata_cnt < 6) {
                if (dst_ghashlen < 16) {
                    if (dst_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(dst_ghash, dst_ghashlen);
                        dst_ghashlen = 0;
                    }
                    if (gdata_cnt < 6)
                        goto Finish;
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++);
                dst_ghashlen -= 16;
            }
            gdata = gdatabuf;
        }
    }

Finish:
    gdatabuf[gdata_cnt++] = ac;

    /* We have the complete set of data to be fed into GHASH. Let's finish the remaining calculation.
     * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM
     * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte stride
     * ahead. */
    assert(STATE_EK0_READY());
    for (size_t i = 0; i < gdata_cnt; ++i)
        gfmul_nextstep128(&gstate, gdatabuf[i], --ghash_precompute);

    gfmul_reduce128(&gstate);
    _mm_storeu_si128(dst, gfmul_get_tag128(&gstate, ek0));

    /* Finish the calculation of the supplemental vector. Done at the very last, because the sample might cover the GCM tag. */
    if ((state & STATE_SUPP_USED) != 0) {
        size_t i;
        if ((state & STATE_SUPP_IN_PROCESS) == 0) {
            bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128;
            bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]);
            i = 1;
        } else {
            i = 2;
        }
        do {
            bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]);
        } while (i != ctx->super.ecb.rounds);
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]);
        _mm_storeu_si128((__m128i *)supp->output, bits4);
    }

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_BEEN_FED
#undef STATE_EK0_INCOMPLETE
#undef STATE_EK0_READY
#undef STATE_SUPP_USED
#undef STATE_SUPP_IN_PROCESS
}

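/* The decryption counterpart. Here GHASH runs over the received ciphertext, so its input is available up front and may be
 * fed ahead of the AES counter blocks; the computed tag is compared against the supplied one with a branchless
 * cmpeq/movemask test, and the function returns non-zero iff the tags match. */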
int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                               const void *_aad, size_t aadlen, const void *tag)
{
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
    __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(),
            bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1;

    const __m128i *gdata; // points to the elements fed into GHASH
    size_t gdata_cnt;

    const __m128i *src_ghash = input, *src_aes = input, *aad = _aad;
    __m128i *dst = output;
    size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen;

    /* schedule ek0 and suppkey */
    ctr = _mm_add_epi64(ctr, one8);
    bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), ctx->super.ecb.keys.m128[0]);
    ++nondata_aes_cnt;

#define STATE_IS_FIRST_RUN 0x1
#define STATE_GHASH_HAS_MORE 0x2
    int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE;

    /* the main loop */
    while (1) {

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata = gdatabuf;
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                        aadlen = 0;
                        ++nondata_aes_cnt;
                    }
                    goto GdataFillSrc;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
                ++nondata_aes_cnt;
            }
        } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) {
            gdata = src_ghash;
            gdata_cnt = 6;
            src_ghash += 6;
            src_ghashlen -= 6 * 16;
        } else {
            gdata = gdatabuf;
            gdata_cnt = 0;
        GdataFillSrc:
            while (gdata_cnt < 6) {
                if (src_ghashlen < 16) {
                    if (src_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(src_ghash, src_ghashlen);
                        src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen);
                        src_ghashlen = 0;
                    }
                    if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) {
                        gdatabuf[gdata_cnt++] = ac;
                        state &= ~STATE_GHASH_HAS_MORE;
                    }
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++);
                src_ghashlen -= 16;
            }
        }

        /* setup aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0))
            goto InitAllBits;
        switch (nondata_aes_cnt) {
#define INIT_BITS(n, keys) \
    case n: \
        ctr = _mm_add_epi64(ctr, one8); \
        bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), keys.m128[0]);
        InitAllBits:
            INIT_BITS(0, ctx->super.ecb.keys);
            INIT_BITS(1, ctx->super.ecb.keys);
            INIT_BITS(2, ctx->super.ecb.keys);
            INIT_BITS(3, ctx->super.ecb.keys);
            INIT_BITS(4, ctx->super.ecb.keys);
            INIT_BITS(5, ctx->super.ecb.keys);
#undef INIT_BITS
        }

        { /* run aes and ghash */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

            size_t aesi;
            for (aesi = 1; aesi <= gdata_cnt; ++aesi) {
                AESECB6_UPDATE(aesi);
                gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
            }
            for (; aesi < ctx->super.ecb.rounds; ++aesi)
                AESECB6_UPDATE(aesi);
            __m128i k = ctx->super.ecb.keys.m128[aesi];
            bits0 = _mm_aesenclast_si128(bits0, k);
            bits1 = _mm_aesenclast_si128(bits1, k);
            bits2 = _mm_aesenclast_si128(bits2, k);
            bits3 = _mm_aesenclast_si128(bits3, k);
            bits4 = _mm_aesenclast_si128(bits4, k);
            bits5 = _mm_aesenclast_si128(bits5, k);

#undef AESECB6_UPDATE
        }

        /* apply aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src_aes += 6;
            src_aeslen -= 6 * 16;
        } else {
            if ((state & STATE_IS_FIRST_RUN) != 0) {
                ek0 = bits0;
                state &= ~STATE_IS_FIRST_RUN;
            }
            switch (nondata_aes_cnt) {
#define APPLY(i) \
    case i: \
        if (PTLS_LIKELY(src_aeslen > 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \
            src_aeslen -= 16; \
        } else { \
            bits0 = bits##i; \
            goto Finish; \
        }
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
            }
            nondata_aes_cnt = 0;
        }
    }

Finish:
    if (src_aeslen == 16) {
        _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0));
    } else if (src_aeslen != 0) {
        storen128(dst, src_aeslen, _mm_xor_si128(loadn128(src_aes, src_aeslen), bits0));
    }

    assert((state & STATE_IS_FIRST_RUN) == 0);

    /* the only case where AES operation is complete and GHASH is not is when the application of AC is remaining */
    if ((state & STATE_GHASH_HAS_MORE) != 0) {
        assert(ghash_precompute - 1 == ctx->ghash);
        gfmul_nextstep128(&gstate, ac, --ghash_precompute);
    }

    gfmul_reduce128(&gstate);
    __m128i calctag = gfmul_get_tag128(&gstate, ek0);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff;

#undef STATE_IS_FIRST_RUN
#undef STATE_GHASH_HAS_MORE
}

static __m128i expand_key(__m128i key, __m128i temp)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    key = _mm_xor_si128(key, temp);

    return key;
}

void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size, int avx256)
{
    assert(is_enc && "decryption is not supported (yet)");

    size_t i = 0;

    switch (key_size) {
    case 16: /* AES128 */
        ctx->rounds = 10;
        break;
    case 32: /* AES256 */
        ctx->rounds = 14;
        break;
    default:
        assert(!"invalid key size; AES128 / AES256 are supported");
        break;
    }
    ctx->avx256 = avx256;

    /* load and expand keys using keys.m128 */
    ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key);
    if (key_size == 32)
        ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key + 1);
    while (1) {
#define EXPAND(R) \
    { \
        ctx->keys.m128[i] = \
            expand_key(ctx->keys.m128[i - key_size / 16], \
                       _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \
        if (i == ctx->rounds) \
            break; \
        ++i; \
        if (key_size > 24) { \
            ctx->keys.m128[i] = \
                expand_key(ctx->keys.m128[i - key_size / 16], \
                           _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \
            ++i; \
        } \
    }
        EXPAND(0x1);
        EXPAND(0x2);
        EXPAND(0x4);
        EXPAND(0x8);
        EXPAND(0x10);
        EXPAND(0x20);
        EXPAND(0x40);
        EXPAND(0x80);
        EXPAND(0x1b);
        EXPAND(0x36);
#undef EXPAND
    }

    /* convert to keys.m256 if avx256 is used */
    if (ctx->avx256) {
        size_t i = ctx->rounds;
        do {
            ctx->keys.m256[i] = _mm256_broadcastsi128_si256(ctx->keys.m128[i]);
        } while (i-- != 0);
    }
}

void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx)
{
    ptls_clear_memory(ctx, sizeof(*ctx));
}

void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src)
{
    __m128i v = _mm_loadu_si128(src);
    v = aesecb_encrypt(ctx, v);
    _mm_storeu_si128(dst, v);
}

/**
 * returns the number of ghash entries that are required to handle an AEAD block of given size
 */
static size_t aesgcm_calc_ghash_cnt(size_t capacity)
{
    // round up by block size, add one to handle the worst split of the size between AAD and payload, plus one for hashing AC
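    // e.g., the 1500-byte capacity assumed by aesgcm_setup yields (1500 + 15) / 16 + 2 = 96 entries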
    return (capacity + 15) / 16 + 2;
}

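/* Appends one entry to the precompute table: entry i receives H^(i+1), obtained by multiplying the previous entry by H,
 * together with r = (upper half of H^(i+1)) ^ (lower half), which gfmul_do_step128/256 consume as the Karatsuba middle
 * operand. */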
static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx)
{
    __m128i *H, *r, *Hprev, H0;

    if (ctx->ecb.avx256) {
        struct ptls_fusion_aesgcm_context256 *ctx256 = (void *)ctx;
#define GET_SLOT(i, mem) (&ctx256->ghash[(i) / 2].mem[(i) % 2 == 0])
        H = GET_SLOT(ctx->ghash_cnt, H);
        r = GET_SLOT(ctx->ghash_cnt, r);
        Hprev = ctx->ghash_cnt == 0 ? NULL : GET_SLOT(ctx->ghash_cnt - 1, H);
#undef GET_SLOT
        H0 = ctx256->ghash[0].H[1];
    } else {
        struct ptls_fusion_aesgcm_context128 *ctx128 = (void *)ctx;
        H = &ctx128->ghash[ctx->ghash_cnt].H;
        r = &ctx128->ghash[ctx->ghash_cnt].r;
        Hprev = ctx->ghash_cnt == 0 ? NULL : &ctx128->ghash[ctx->ghash_cnt - 1].H;
        H0 = ctx128->ghash[0].H;
    }

    if (Hprev != NULL)
        *H = gfmul(*Hprev, H0);

    *r = _mm_shuffle_epi32(*H, 78);
    *r = _mm_xor_si128(*r, *H);

    ++ctx->ghash_cnt;
}

static size_t calc_aesgcm_context_size(size_t *ghash_cnt, int avx256)
{
    size_t sz;

    if (avx256) {
        if (*ghash_cnt % 2 != 0)
            ++*ghash_cnt;
        sz = offsetof(struct ptls_fusion_aesgcm_context256, ghash) +
             sizeof(union ptls_fusion_aesgcm_ghash_precompute256) * *ghash_cnt / 2;
    } else {
        sz = offsetof(struct ptls_fusion_aesgcm_context128, ghash) +
             sizeof(struct ptls_fusion_aesgcm_ghash_precompute128) * *ghash_cnt;
    }
    return sz;
}

static ptls_fusion_aesgcm_context_t *new_aesgcm(const void *key, size_t key_size, size_t capacity, int avx256)
{
    ptls_fusion_aesgcm_context_t *ctx;
    size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity), ctx_size = calc_aesgcm_context_size(&ghash_cnt, avx256);

    if ((ctx = aligned_alloc(32, ctx_size)) == NULL)
        return NULL;

    ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size, avx256);

    ctx->capacity = capacity;

    __m128i H0 = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128());
    H0 = _mm_shuffle_epi8(H0, byteswap128);
    H0 = transformH(H0);
    if (ctx->ecb.avx256) {
        ((struct ptls_fusion_aesgcm_context256 *)ctx)->ghash[0].H[1] = H0;
    } else {
        ((struct ptls_fusion_aesgcm_context128 *)ctx)->ghash[0].H = H0;
    }

    ctx->ghash_cnt = 0;
    while (ctx->ghash_cnt < ghash_cnt)
        setup_one_ghash_entry(ctx);

    return ctx;
}

ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity)
{
    return new_aesgcm(key, key_size, capacity, 0);
}

ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity)
{
    size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity);

    if (ghash_cnt <= ctx->ghash_cnt)
        return ctx;

    size_t ctx_size = calc_aesgcm_context_size(&ghash_cnt, ctx->ecb.avx256);
    if ((ctx = realloc(ctx, ctx_size)) == NULL)
        return NULL;

    ctx->capacity = capacity;
    while (ctx->ghash_cnt < ghash_cnt)
        setup_one_ghash_entry(ctx);

    return ctx;
}

void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx)
{
    ptls_clear_memory(ctx, calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.avx256));
    /* skip ptls_fusion_aesecb_dispose, based on the knowledge that it does not allocate memory elsewhere */

    free(ctx);
}

static void ctr_dispose(ptls_cipher_context_t *_ctx)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;
    ptls_fusion_aesecb_dispose(&ctx->fusion);
    _mm_storeu_si128(&ctx->bits, _mm_setzero_si128());
}

static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;
    _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv)));
    ctx->is_ready = 1;
}

static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;

    assert((ctx->is_ready && len <= 16) ||
           !"CTR transformation is supported only once per call to `init` and the maximum size is limited to 16 bytes");
    ctx->is_ready = 0;

    if (len < 16) {
        storen128(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn128(input, len)));
    } else {
        _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input)));
    }
}

static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size)
{
    struct ctr_context *ctx = (struct ctr_context *)_ctx;

    ctx->super.do_dispose = ctr_dispose;
    ctx->super.do_init = ctr_init;
    ctx->super.do_transform = ctr_transform;
    ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size, 0 /* probably we do not need avx256 for CTR? */);
    ctx->is_ready = 0;

    return 0;
}

static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
{
    return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE);
}

static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
{
    return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE);
}

static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ptls_fusion_aesgcm_free(ctx->aesgcm);
}

static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen)
{
    assert(!"FIXME");
}

static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen)
{
    assert(!"FIXME");
    return SIZE_MAX;
}

static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output)
{
    assert(!"FIXME");
    return SIZE_MAX;
}

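/* Builds the initial counter block for a record, following the usual TLS/QUIC nonce construction: `seq` is shifted into
 * bytes 4..11 and XORed onto the 96-bit static IV (held byte-swapped in the upper bytes of `static_iv`), while bytes 0..3
 * are left zero for the intra-record block counter that ptls_fusion_aesgcm_encrypt/decrypt manage (1 for EK0, 2 onwards for
 * the payload blocks). */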
static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq)
{
    __m128i ctr = _mm_setzero_si128();
    ctr = _mm_insert_epi64(ctr, seq, 0);
    ctr = _mm_slli_si128(ctr, 4);
    ctr = _mm_xor_si128(ctx->static_iv, ctr);
    return ctr;
}

void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
                     const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
{
    struct aesgcm_context *ctx = (void *)_ctx;

    if (inlen + aadlen > ctx->aesgcm->capacity)
        ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen);
    ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp);
}

static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq,
                              const void *aad, size_t aadlen)
{
    assert(!"FIXME");
}

static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
                              const void *aad, size_t aadlen)
{
    struct aesgcm_context *ctx = (void *)_ctx;

    if (inlen < 16)
        return SIZE_MAX;

    size_t enclen = inlen - 16;
    if (enclen + aadlen > ctx->aesgcm->capacity)
        ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen);
    if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen,
                                    (const uint8_t *)input + enclen))
        return SIZE_MAX;
    return enclen;
}

static inline void aesgcm_xor_iv(ptls_aead_context_t *_ctx, const void *_bytes, size_t len)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
    __m128i xor_mask = loadn128(_bytes, len);
    xor_mask = _mm_shuffle_epi8(xor_mask, byteswap128);
    ctx->static_iv = _mm_xor_si128(ctx->static_iv, xor_mask);
}

static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    if (key == NULL)
        return 0;

    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_xor_iv = aesgcm_xor_iv;
    ctx->super.do_encrypt_init = aead_do_encrypt_init;
    ctx->super.do_encrypt_update = aead_do_encrypt_update;
    ctx->super.do_encrypt_final = aead_do_encrypt_final;
    ctx->super.do_encrypt = aead_do_encrypt;
    ctx->super.do_encrypt_v = aead_do_encrypt_v;
    ctx->super.do_decrypt = aead_do_decrypt;

    ctx->aesgcm = new_aesgcm(key, key_size, 1500 /* assume ordinary packet size */, 0 /* no support for avx256 yet */);

    return 0;
}

static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}

static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}

int ptls_fusion_can_avx256 = 0;
ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR",
                                                 PTLS_AES128_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes128ctr_setup};
ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR",
                                                 PTLS_AES256_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes256ctr_setup};
ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes128ctr,
                                               NULL, // &ptls_fusion_aes128ecb,
                                               PTLS_AES128_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               sizeof(struct aesgcm_context),
                                               aes128gcm_setup};
ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes256ctr,
                                               NULL, // &ptls_fusion_aes256ecb,
                                               PTLS_AES256_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               sizeof(struct aesgcm_context),
                                               aes256gcm_setup};

Kazuho Oku3b2ab612022-05-09 06:41:48 +09001220static inline size_t calc_total_length(ptls_iovec_t *input, size_t incnt)
1221{
1222 size_t totlen = 0;
1223 for (size_t i = 0; i < incnt; ++i)
1224 totlen += input[i].len;
1225 return totlen;
1226}
1227
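/* Folds the AAD into the GHASH state. Blocks are consumed up to six at a time, each multiplied by the matching precomputed
 * power of H with the modular reduction deferred to once per batch; the tail pads the final partial block with zeroes. */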
static inline void reduce_aad128(struct ptls_fusion_gfmul_state128 *gstate, struct ptls_fusion_aesgcm_ghash_precompute128 *ghash,
                                 const void *_aad, size_t aadlen)
{
    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute;
    const uint8_t *aad = _aad;

    while (PTLS_UNLIKELY(aadlen >= 6 * 16)) {
        ghash_precompute = ghash + 6;
        gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
        aad += 16;
        aadlen -= 16;
        for (int i = 1; i < 6; ++i) {
            gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
            aad += 16;
            aadlen -= 16;
        }
        gfmul_reduce128(gstate);
    }

    if (PTLS_LIKELY(aadlen != 0)) {
        ghash_precompute = ghash + (aadlen + 15) / 16;
        if (PTLS_UNLIKELY(aadlen >= 16)) {
            gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
            aad += 16;
            aadlen -= 16;
            while (aadlen >= 16) {
                gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
                aad += 16;
                aadlen -= 16;
            }
            if (PTLS_LIKELY(aadlen != 0))
                gfmul_nextstep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
        } else {
            gfmul_firststep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
        }
        assert(ghash == ghash_precompute);
        gfmul_reduce128(gstate);
    }
}

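/* Copies the bytes that precede `*output` within its 64-byte-aligned region into the head of `encbuf`, rewinding `*output` to
 * that boundary, so that the main loop can issue full-width aligned non-temporal stores. The reads behind `*output` are the
 * reason this function is excluded from AddressSanitizer instrumentation. */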
NO_SANITIZE_ADDRESS
static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output)
{
    uint8_t *encp;

    if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) {
        _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf))));
        _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32)));
        *output -= encp - encbuf;
    }

    return encp;
}

NO_SANITIZE_ADDRESS
static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end)
{
    /* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line
     * would likely be read when the next TLS record is being built. */

    for (; end - src >= 64; dst += 64, src += 64) {
        _mm256_stream_si256((void *)dst, _mm256_load_si256((void *)src));
        _mm256_stream_si256((void *)(dst + 32), _mm256_load_si256((void *)(src + 32)));
    }
    _mm_sfence(); /* weakly ordered writes have to be synced before being passed to NIC */
    if (src != end) {
        for (; end - src >= 16; dst += 16, src += 16)
            _mm_store_si128((void *)dst, _mm_load_si128((void *)src));
        if (src != end)
            storen128((void *)dst, end - src, loadn128((void *)src, end - src));
    }
}

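/* Single-pass AES-GCM encryption, with the ciphertext written to main memory using non-temporal stores. Each iteration of the
 * main loop XORs up to 96 bytes of plaintext with the keystream, folds the produced ciphertext into GHASH while the next six
 * counter blocks are being encrypted, and flushes completed 64-byte cache lines via _mm256_stream_si256. */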
static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *aad, size_t aadlen)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT()                                                                                                             \
    do {                                                                                                                           \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128);                                                                                \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128);                                                                                \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128);                                                                                \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128);                                                                                \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128);                                                                                \
        if (PTLS_LIKELY(srclen > 16 * 5) || src_vecleft != 0) {                                                                    \
            ctr = _mm_add_epi64(ctr, one8);                                                                                        \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128);                                                                            \
        } else {                                                                                                                   \
            bits5 = ek0;                                                                                                           \
            state |= STATE_EK0_READY;                                                                                              \
        }                                                                                                                          \
        __m128i k = ctx->super.ecb.keys.m128[0];                                                                                   \
        bits0 = _mm_xor_si128(bits0, k);                                                                                           \
        bits1 = _mm_xor_si128(bits1, k);                                                                                           \
        bits2 = _mm_xor_si128(bits2, k);                                                                                           \
        bits3 = _mm_xor_si128(bits3, k);                                                                                           \
        bits4 = _mm_xor_si128(bits4, k);                                                                                           \
        bits5 = _mm_xor_si128(bits5, k);                                                                                           \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i)                                                                                                          \
    do {                                                                                                                           \
        __m128i k = ctx->super.ecb.keys.m128[i];                                                                                   \
        bits0 = _mm_aesenc_si128(bits0, k);                                                                                        \
        bits1 = _mm_aesenc_si128(bits1, k);                                                                                        \
        bits2 = _mm_aesenc_si128(bits2, k);                                                                                        \
        bits3 = _mm_aesenc_si128(bits3, k);                                                                                        \
        bits4 = _mm_aesenc_si128(bits4, k);                                                                                        \
        bits5 = _mm_aesenc_si128(bits5, k);                                                                                        \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i)                                                                                                           \
    do {                                                                                                                           \
        __m128i k = ctx->super.ecb.keys.m128[i];                                                                                   \
        bits0 = _mm_aesenclast_si128(bits0, k);                                                                                    \
        bits1 = _mm_aesenclast_si128(bits1, k);                                                                                    \
        bits2 = _mm_aesenclast_si128(bits2, k);                                                                                    \
        bits3 = _mm_aesenclast_si128(bits3, k);                                                                                    \
        bits4 = _mm_aesenclast_si128(bits4, k);                                                                                    \
        bits5 = _mm_aesenclast_si128(bits5, k);                                                                                    \
    } while (0)
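
/* Six blocks are processed per AESECB6_* invocation so that the multi-cycle latency of each AESENC can be hidden behind the
 * independent work of the other lanes; such interleaving is the customary approach for AES-NI on x86 cores. */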

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;

#define STATE_EK0_READY 0x1
#define STATE_COPY_128B 0x2
    int32_t state = 0;

    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time.
     * Use of thread-local storage is a hack. GCC spills a lot onto the stack, and unless we move `encbuf` off the stack,
     * temporary variables end up being stored somewhere not as fast as places near the stack pointer. This provides 18% speed
     * improvement on Core i5 9400, at the cost of 2% slowdown on Zen 3. */
#if defined(__GNUC__) && !defined(__clang__)
    static __thread
#endif
        uint8_t encbuf[32 * 6] __attribute__((aligned(32))),
        *encp;

    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, six 16-byte AES blocks, plus the AEAD tag that
     * is appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16);

    /* load unaligned data within the same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);

    /* The first write will be 128 bytes (32 + 6 * 16), if encbuf already contains no less than 32 bytes. */
    if (encp - encbuf >= 32)
        state |= STATE_COPY_128B;

    /* setup ctr, retain Ek(0), len(A) | len(C) to be fed into GCM */
    __m128i ctr = calc_counter(agctx, seq);
    ctr = _mm_insert_epi32(ctr, 1, 0);
    __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128);
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128);
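    /* (In GCM, counter value 1 yields EK0, the block that masks the GHASH output to form the tag; values 2 and up generate the
     * keystream; `ac` is the final GHASH input, i.e., the bit lengths of AAD and ciphertext as 64-bit big-endian integers.) */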

    struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm;
    __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};

    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }

    /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    reduce_aad128(&gstate, ctx->ghash, aad, aadlen);
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

    /* Main loop. This loop:
     *  1. using the current keystream (bits0..bits5), XORs up to 6 * 16 bytes and writes them to encbuf,
     *  2. then, if there is no more data to be encrypted, exits the loop, otherwise,
     *  3. calculates the ghash of the blocks being written to encbuf,
     *  4. calculates the next 6 * 16 bytes of keystream,
     *  5. writes encbuf in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be
     * calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(src + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 16;
                src += 6 * 16;
                srclen -= 6 * 16;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        remaining_ghash_from = (encp - encbuf) - 96;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 16 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 16 && bytes_copied < 5 * 16) {
                        _mm_storeu_si128((void *)(encp + bytes_copied), _mm_loadu_si128((void *)src));
                        bytes_copied += 16;
                        src += 16;
                        srclen -= 16;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        if (src_vecleft == 0) {
                            break;
                        } else {
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        }
                    }
                } while (bytes_copied < 6 * 16);
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(encp + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, as well as zero-clearing the remainder of the partial
                     * block, as it will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }

            /* Next 96-byte block starts here. Run AES and ghash in parallel while writing the output using non-temporal stores
             * in 64-byte blocks. */
            AESECB6_INIT();
            struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encp - 6 * 16)), --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 5 * 16)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 4 * 16)), --ghash_precompute);
            AESECB6_UPDATE(3);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            AESECB6_UPDATE(4);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 3 * 16)), --ghash_precompute);
            AESECB6_UPDATE(5);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 2 * 16)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 1 * 16)), --ghash_precompute);
            AESECB6_UPDATE(7);
            if ((state & STATE_COPY_128B) != 0) {
                _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
                output += 128;
                encp -= 128;
                AESECB6_UPDATE(8);
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 128)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 160)));
            } else {
                output += 64;
                encp -= 64;
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 96)));
                AESECB6_UPDATE(8);
            }
            state ^= STATE_COPY_128B;
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce128(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }

    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */

    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * 7 blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), ac);
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 7);
        struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + blocks;
        gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
        remaining_ghash_from += 16;
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 16;
        }
        gfmul_reduce128(&gstate);
    }

    /* Calculate EK0, in the unlikely case that it has not been done yet. When encrypting at full size (16K), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]);
        bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]);
    }

    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp, gfmul_get_tag128(&gstate, bits5));
    encp += 16;

    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_READY
#undef STATE_COPY_128B
}

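/* 256-bit variant of non_temporal_encrypt_v128, relying on the VAES and VPCLMULQDQ extensions. Each YMM register carries two
 * 16-byte blocks, so one iteration of the main loop produces 192 bytes of ciphertext. */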
NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v256(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *_aad, size_t aadlen)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT()                                                                                                             \
    do {                                                                                                                           \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits0 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits1 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits2 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits3 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits4 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits5 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        if (PTLS_UNLIKELY(srclen <= 32 * 6 - 16) && src_vecleft == 0) {                                                            \
            bits5 = _mm256_permute2f128_si256(bits5, ac_ek0, 0x30);                                                                \
            state |= STATE_EK0_READY;                                                                                              \
        }                                                                                                                          \
        __m256i k = ctx->super.ecb.keys.m256[0];                                                                                   \
        bits0 = _mm256_xor_si256(bits0, k);                                                                                        \
        bits1 = _mm256_xor_si256(bits1, k);                                                                                        \
        bits2 = _mm256_xor_si256(bits2, k);                                                                                        \
        bits3 = _mm256_xor_si256(bits3, k);                                                                                        \
        bits4 = _mm256_xor_si256(bits4, k);                                                                                        \
        bits5 = _mm256_xor_si256(bits5, k);                                                                                        \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i)                                                                                                          \
    do {                                                                                                                           \
        __m256i k = ctx->super.ecb.keys.m256[i];                                                                                   \
        bits0 = _mm256_aesenc_epi128(bits0, k);                                                                                    \
        bits1 = _mm256_aesenc_epi128(bits1, k);                                                                                    \
        bits2 = _mm256_aesenc_epi128(bits2, k);                                                                                    \
        bits3 = _mm256_aesenc_epi128(bits3, k);                                                                                    \
        bits4 = _mm256_aesenc_epi128(bits4, k);                                                                                    \
        bits5 = _mm256_aesenc_epi128(bits5, k);                                                                                    \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i)                                                                                                           \
    do {                                                                                                                           \
        __m256i k = ctx->super.ecb.keys.m256[i];                                                                                   \
        bits0 = _mm256_aesenclast_epi128(bits0, k);                                                                                \
        bits1 = _mm256_aesenclast_epi128(bits1, k);                                                                                \
        bits2 = _mm256_aesenclast_epi128(bits2, k);                                                                                \
        bits3 = _mm256_aesenclast_epi128(bits3, k);                                                                                \
        bits4 = _mm256_aesenclast_epi128(bits4, k);                                                                                \
        bits5 = _mm256_aesenclast_epi128(bits5, k);                                                                                \
    } while (0)

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *aad = _aad;

#define STATE_EK0_READY 0x1
    int32_t state = 0;

    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time. */
    uint8_t encbuf[32 * 9] __attribute__((aligned(32))), *encp;

    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, 6 * 32 bytes of AES output, plus the AEAD tag
     * that is appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16);

    /* load unaligned data within the same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);

    /* setup ctr, retaining Ek(0), len(A) | len(C) to be fed into GCM */
    __m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq));
    ctr = _mm256_insert_epi32(ctr, 1, 4);
    __m256i ac_ek0 = _mm256_permute2f128_si256(
        /* first half: ac */
        _mm256_castsi128_si256(
            _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128)),
        /* second half: ek0 */
        _mm256_shuffle_epi8(ctr, byteswap256), 0x30);

    struct ptls_fusion_aesgcm_context256 *ctx = (void *)agctx->aesgcm;
    __m256i bits0, bits1, bits2, bits3, bits4, bits5 = _mm256_setzero_si256();
    struct ptls_fusion_gfmul_state256 gstate = {0};

    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }

    /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    if (PTLS_LIKELY(aadlen != 0)) {
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute;
        while (PTLS_UNLIKELY(aadlen >= 6 * 32)) {
            ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
            aad += 32;
            aadlen -= 32;
            for (int i = 1; i < 6; ++i) {
                gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                aad += 32;
                aadlen -= 32;
            }
            gfmul_reduce256(&gstate);
        }
        if (PTLS_LIKELY(aadlen != 0)) {
            ghash_precompute = ctx->ghash + (aadlen + 31) / 32;
            if (PTLS_UNLIKELY(aadlen >= 32)) {
                if (aadlen % 32 == 0 || aadlen % 32 > 16) {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                } else {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 1, --ghash_precompute);
                    aad += 16;
                    aadlen -= 16;
                }
                while (aadlen >= 32) {
                    gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                }
                if (PTLS_LIKELY(aadlen != 0)) {
                    assert(aadlen > 16);
                    gfmul_nextstep256(&gstate, loadn256(aad, aadlen), --ghash_precompute);
                }
            } else {
                gfmul_firststep256(&gstate, loadn256(aad, aadlen), aadlen <= 16, --ghash_precompute);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
        }
    }
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

    /* Main loop. This loop:
     *  1. using the current keystream (bits0..bits5), XORs up to 6 * 32 bytes and writes them to encbuf,
     *  2. then, if there is no more data to be encrypted, exits the loop, otherwise,
     *  3. calculates the ghash of the blocks being written to encbuf,
     *  4. calculates the next 6 * 32 bytes of keystream,
     *  5. writes encbuf in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be
     * calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 32)) {
#define APPLY(i) _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(src + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 32;
                src += 6 * 32;
                srclen -= 6 * 32;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        remaining_ghash_from = (encp - encbuf) - 6 * 32;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 32 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 32 && bytes_copied < 5 * 32) {
                        _mm256_storeu_si256((void *)(encp + bytes_copied), _mm256_loadu_si256((void *)src));
                        bytes_copied += 32;
                        src += 32;
                        srclen -= 32;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        if (src_vecleft == 0) {
                            break;
                        } else {
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        }
                    }
                } while (bytes_copied < 6 * 32);
#define APPLY(i)                                                                                                                   \
    _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(encp + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, as well as zero-clearing the remainder of the partial
                     * block, as it will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }

            /* Next 192-byte block starts here. Run AES and ghash in parallel while writing the output using non-temporal store
             * instructions. */
            AESECB6_INIT();
            union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encp - 6 * 32)), 0, --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 5 * 32)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 4 * 32)), --ghash_precompute);
            AESECB6_UPDATE(3);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 3 * 32)), --ghash_precompute);
            AESECB6_UPDATE(4);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
            _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
            _mm256_stream_si256((void *)(output + 128), _mm256_load_si256((void *)(encbuf + 128)));
            _mm256_stream_si256((void *)(output + 160), _mm256_load_si256((void *)(encbuf + 160)));
            AESECB6_UPDATE(5);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 2 * 32)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 1 * 32)), --ghash_precompute);
            output += 192;
            encp -= 192;
            AESECB6_UPDATE(7);
            _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 192)));
            AESECB6_UPDATE(8);
            _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 224)));
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }

    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */

    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * 13 blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), _mm256_castsi256_si128(ac_ek0));
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 13);
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + blocks / 2;
        if (blocks % 2 != 0) {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 1, ghash_precompute);
            remaining_ghash_from += 16;
        } else {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 0, --ghash_precompute);
            remaining_ghash_from += 32;
        }
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 32;
        }
        gfmul_reduce256(&gstate);
    }

    /* Calculate EK0, in the unlikely case that it has not been done yet. When encrypting at full size (16K), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = ac_ek0;
        bits5 = _mm256_xor_si256(bits5, ctx->super.ecb.keys.m256[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm256_aesenc_epi128(bits5, ctx->super.ecb.keys.m256[i]);
        bits5 = _mm256_aesenclast_epi128(bits5, ctx->super.ecb.keys.m256[ctx->super.ecb.rounds]);
    }

    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp,
                     gfmul_get_tag256(&gstate, _mm256_castsi256_si128(_mm256_permute2f128_si256(bits5, bits5, 0x11))));
    encp += 16;

    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);
}

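/* Common setup for the non-temporal AES-GCM variants; the 256-bit code path is selected when the CPU reports VAES and
 * VPCLMULQDQ support. Note that only encryption is provided at the moment (do_decrypt is left NULL). */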
static int nt_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    if (key == NULL)
        return 0;

    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_xor_iv = aesgcm_xor_iv;
    ctx->super.do_encrypt = ptls_aead__do_encrypt;
    ctx->super.do_encrypt_v = ptls_fusion_can_avx256 ? non_temporal_encrypt_v256 : non_temporal_encrypt_v128;
    ctx->super.do_decrypt = NULL; /* FIXME */

    ctx->aesgcm = new_aesgcm(key, key_size,
                             7 * (ptls_fusion_can_avx256 ? 32 : 16), // 6 blocks at once, plus len(A) | len(C) that we might append
                             ptls_fusion_can_avx256);

    return 0;
}

static int non_temporal_aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return nt_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}

static int non_temporal_aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return nt_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}

ptls_aead_algorithm_t ptls_non_temporal_aes128gcm = {"AES128-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes128ctr,
                                                     NULL, // &ptls_fusion_aes128ecb,
                                                     PTLS_AES128_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes128gcm_setup};
ptls_aead_algorithm_t ptls_non_temporal_aes256gcm = {"AES256-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes256ctr,
                                                     NULL, // &ptls_fusion_aes256ecb,
                                                     PTLS_AES256_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes256gcm_setup};
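
/* Usage sketch (illustrative only, under the assumption that the standard picotls helpers ptls_aead_new_direct(),
 * ptls_aead_encrypt_v() and ptls_aead_free() declared in picotls.h are available): a caller that has verified
 * ptls_fusion_is_supported_by_cpu() could encrypt a record roughly as follows.
 *
 *     ptls_aead_context_t *enc = ptls_aead_new_direct(&ptls_non_temporal_aes128gcm, 1, key, iv); // 16-byte key, 12-byte IV
 *     uint8_t sealed[1500 + PTLS_AESGCM_TAG_SIZE];
 *     ptls_iovec_t vec = ptls_iovec_init(payload, payload_len);
 *     ptls_aead_encrypt_v(enc, sealed, &vec, 1, seq, aad, aadlen);
 *     ptls_aead_free(enc);
 */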

#ifdef _WINDOWS
/**
 * ptls_fusion_is_supported_by_cpu:
 * Check that the CPU has extended instructions for PCLMUL, AES and AVX2.
 * This test assumes that the CPU is following the x86/x64 architecture.
 * A slightly more refined test could check that the cpu_info spells out
 * "GenuineIntel" or "AuthenticAMD", but it would fail in the presence of
 * lesser-known CPU brands or some VMs. */
int ptls_fusion_is_supported_by_cpu(void)
{
    uint32_t cpu_info[4];
    uint32_t nb_ids;
    int is_supported = 0;

    __cpuid(cpu_info, 0);
    nb_ids = cpu_info[0];

    if (nb_ids >= 7) {
        uint32_t leaf1_ecx;
        __cpuid(cpu_info, 1);
        leaf1_ecx = cpu_info[2];

        if (/* PCLMUL */ (leaf1_ecx & (1 << 1)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) {
            uint32_t leaf7_ebx, leaf7_ecx;
            __cpuid(cpu_info, 7);
            leaf7_ebx = cpu_info[1];
            leaf7_ecx = cpu_info[2];

            is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0;

            /* enable 256-bit mode if possible */
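            /* (leaf-7 ECX bit 9 is VAES, bit 10 is VPCLMULQDQ; 0x600 covers both) */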
            if (is_supported && (leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_avx256)
                ptls_fusion_can_avx256 = 1;
        }
    }

    return is_supported;
}
#else
int ptls_fusion_is_supported_by_cpu(void)
{
    unsigned leaf1_ecx, leaf7_ebx, leaf7_ecx;

    { /* GCC-specific code to obtain CPU features */
        unsigned leaf_cnt;
        __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx");
        if (leaf_cnt < 7)
            return 0;
        __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx");
        __asm__("cpuid" : "=b"(leaf7_ebx), "=c"(leaf7_ecx) : "a"(7), "c"(0) : "edx");
    }

    /* AVX2 */
    if ((leaf7_ebx & (1 << 5)) == 0)
        return 0;
    /* AES */
    if ((leaf1_ecx & (1 << 25)) == 0)
        return 0;
    /* PCLMUL */
    if ((leaf1_ecx & (1 << 1)) == 0)
        return 0;

    /* enable 256-bit mode if possible */
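    /* (leaf-7 ECX bit 9 is VAES, bit 10 is VPCLMULQDQ; 0x600 covers both) */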
    if ((leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_avx256)
        ptls_fusion_can_avx256 = 1;

    return 1;
}
#endif