/*
 * This source file is licensed under the Apache License 2.0 *and* the MIT
 * License. Please agree to *both* of the licensing terms!
 *
 *
 * `transformH` function is a derivative work of OpenSSL. The original work
 * is covered by the following license:
 *
 * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 *
 * All other work, including modifications to the `transformH` function is
 * covered by the following MIT license:
 *
 * Copyright (c) 2020-2022 Fastly, Kazuho Oku
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <stdint.h>

#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <nmmintrin.h>
#include <wmmintrin.h>
#include "picotls.h"
#include "picotls/fusion.h"

#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#endif
#elif __SANITIZE_ADDRESS__ /* gcc */
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#endif
#ifndef NO_SANITIZE_ADDRESS
#define NO_SANITIZE_ADDRESS
#endif

#ifdef _WINDOWS
#define aligned_alloc(a, s) _aligned_malloc((s), (a))
#define aligned_free(p) _aligned_free(p)
#else
#define aligned_free(p) free(p)
#endif

struct ptls_fusion_aesgcm_context {
    ptls_fusion_aesecb_context_t ecb;
    size_t capacity;
    size_t ghash_cnt;
};

struct ptls_fusion_aesgcm_context128 {
    struct ptls_fusion_aesgcm_context super;
    struct ptls_fusion_aesgcm_ghash_precompute128 {
        __m128i H;
        __m128i r;
    } ghash[0];
};

struct ptls_fusion_aesgcm_context256 {
    struct ptls_fusion_aesgcm_context super;
    union ptls_fusion_aesgcm_ghash_precompute256 {
        struct {
            __m128i H[2];
            __m128i r[2];
        };
        struct {
            __m256i Hx2;
            __m256i rx2;
        };
    } ghash[0];
};
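
/* Layout of the precomputed GHASH tables: ghash[i] caches H^(i+1) in the bit-reflected representation produced by
 * `transformH` below, together with r = H_hi ^ H_lo, the value consumed by the Karatsuba middle multiplication in
 * gfmul_do_step128 / gfmul_do_step256. The tables are walked backwards (`--ghash_precompute`), so earlier inputs get
 * multiplied by higher powers of H and the last block fed is always multiplied by H^1. */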

struct ctr_context {
    ptls_cipher_context_t super;
    ptls_fusion_aesecb_context_t fusion;
    __m128i bits;
    uint8_t is_ready;
};

struct aesgcm_context {
    ptls_aead_context_t super;
    ptls_fusion_aesgcm_context_t *aesgcm;
    /**
     * retains the static IV in the upper 96 bits (in little endian)
     */
    __m128i static_iv;
};

static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
#define poly (*(__m128i *)poly_)
static const uint8_t byteswap_[32] __attribute__((aligned(32))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                                                   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#define byteswap128 (*(__m128i *)byteswap_)
#define byteswap256 (*(__m256i *)byteswap_)
static const uint8_t one_[16] __attribute__((aligned(16))) = {1};
#define one8 (*(__m128i *)one_)
static const uint8_t incr128x2_[32] __attribute__((aligned(32))) = {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2};
#define incr128x2 (*(__m256i *)incr128x2_)

/* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl
 * at commit 33388b4. */
static __m128i transformH(__m128i H)
{
    //  # <<1 twist
    //  pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
    __m128i t2 = _mm_shuffle_epi32(H, 0xff);
    //  movdqa $Hkey,$T1
    __m128i t1 = H;
    //  psllq \$1,$Hkey
    H = _mm_slli_epi64(H, 1);
    //  pxor $T3,$T3 #
    __m128i t3 = _mm_setzero_si128();
    //  psrlq \$63,$T1
    t1 = _mm_srli_epi64(t1, 63);
    //  pcmpgtd $T2,$T3 # broadcast carry bit
    t3 = _mm_cmplt_epi32(t2, t3);
    //  pslldq \$8,$T1
    t1 = _mm_slli_si128(t1, 8);
    //  por $T1,$Hkey # H<<=1
    H = _mm_or_si128(t1, H);

    //  # magic reduction
    //  pand .L0x1c2_polynomial(%rip),$T3
    t3 = _mm_and_si128(t3, poly);
    //  pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
    H = _mm_xor_si128(t3, H);

    return H;
}
// end of Apache License code
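
/* What the twist above buys us: GHASH is defined on bit-reflected field elements, while PCLMULQDQ multiplies plain
 * carry-less integers, leaving the 255-bit product misaligned by one bit. Doubling H once in GF(2^128) (the shift-left
 * with a conditional fold of the carry into the 0xc2... polynomial) absorbs that misalignment up front, so gfmul()
 * below can use straight clmul plus a cheap reduction. */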

static __m128i gfmul(__m128i x, __m128i y)
{
    __m128i lo = _mm_clmulepi64_si128(x, y, 0x00);
    __m128i hi = _mm_clmulepi64_si128(x, y, 0x11);

    __m128i a = _mm_shuffle_epi32(x, 78);
    __m128i b = _mm_shuffle_epi32(y, 78);
    a = _mm_xor_si128(a, x);
    b = _mm_xor_si128(b, y);

    a = _mm_clmulepi64_si128(a, b, 0x00);
    a = _mm_xor_si128(a, lo);
    a = _mm_xor_si128(a, hi);

    b = _mm_slli_si128(a, 8);
    a = _mm_srli_si128(a, 8);

    lo = _mm_xor_si128(lo, b);
    hi = _mm_xor_si128(hi, a);

    // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
    __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);
    t = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, t);

    return _mm_xor_si128(hi, lo);
}
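
/* gfmul computes a 3-multiplication Karatsuba product: with x = x1·2^64 ^ x0 and y = y1·2^64 ^ y0 (all products
 * carry-less),
 *
 *   lo = x0·y0, hi = x1·y1, mid = (x0 ^ x1)·(y0 ^ y1) ^ lo ^ hi
 *   x·y = hi·2^128 ^ mid·2^64 ^ lo
 *
 * after which the two clmuls against `poly` fold the 256-bit product back to 128 bits. The step functions below
 * compute the same three products per block but keep hi/lo/mid accumulated separately, deferring both the mid-term
 * merge and the reduction until gfmul_reduce128/256. */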

static inline __m128i gfmul_do_reduce(__m128i hi, __m128i lo, __m128i mid)
{
    mid = _mm_xor_si128(mid, hi);
    mid = _mm_xor_si128(mid, lo);
    lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
    hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

    /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */
    __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, r);
    r = _mm_clmulepi64_si128(lo, poly, 0x10);
    lo = _mm_shuffle_epi32(lo, 78);
    lo = _mm_xor_si128(lo, r);
    lo = _mm_xor_si128(hi, lo);

    return lo;
}

struct ptls_fusion_gfmul_state128 {
    __m128i hi, lo, mid;
};

#if defined(__GNUC__) && !defined(__clang__)
static inline __m128i xor128(__m128i x, __m128i y)
{
    __m128i ret;
    __asm__("vpxor %2, %1, %0" : "=x"(ret) : "x"(x), "xm"(y));
    return ret;
}
#else
#define xor128 _mm_xor_si128
#endif

static inline void gfmul_do_step128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                    struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    __m128i t1 = _mm_clmulepi64_si128(precompute->H, X, 0x00);
    __m128i t2 = _mm_clmulepi64_si128(precompute->H, X, 0x11);
    __m128i t3 = _mm_shuffle_epi32(X, 78);
    t3 = _mm_xor_si128(t3, X);
    t3 = _mm_clmulepi64_si128(precompute->r, t3, 0x00);
    gstate->lo = xor128(gstate->lo, t1);
    gstate->hi = xor128(gstate->hi, t2);
    gstate->mid = xor128(gstate->mid, t3);
}

#undef xor128

static inline void gfmul_firststep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                      struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    X = _mm_shuffle_epi8(X, byteswap128);
    X = _mm_xor_si128(gstate->lo, X);
    gstate->lo = _mm_setzero_si128();
    gstate->hi = _mm_setzero_si128();
    gstate->mid = _mm_setzero_si128();
    gfmul_do_step128(gstate, X, precompute);
}

static inline void gfmul_nextstep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
                                     struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
{
    X = _mm_shuffle_epi8(X, byteswap128);
    gfmul_do_step128(gstate, X, precompute);
}

static inline void gfmul_reduce128(struct ptls_fusion_gfmul_state128 *gstate)
{
    gstate->lo = gfmul_do_reduce(gstate->hi, gstate->lo, gstate->mid);
}

static inline __m128i gfmul_get_tag128(struct ptls_fusion_gfmul_state128 *gstate, __m128i ek0)
{
    __m128i tag = _mm_shuffle_epi8(gstate->lo, byteswap128);
    tag = _mm_xor_si128(tag, ek0);
    return tag;
}
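
/* A minimal sketch of how the state machine above is driven, assuming `blocks` holds `nblocks` 16-byte inputs and
 * `precompute` points into a table built by setup_one_ghash_entry (illustrative only; the real callers below
 * interleave these steps with AES rounds):
 *
 *     struct ptls_fusion_gfmul_state128 gstate = {0};
 *     struct ptls_fusion_aesgcm_ghash_precompute128 *precompute = ctx->ghash + nblocks;
 *     gfmul_firststep128(&gstate, blocks[0], --precompute);
 *     for (size_t i = 1; i < nblocks; ++i)
 *         gfmul_nextstep128(&gstate, blocks[i], --precompute);
 *     gfmul_reduce128(&gstate);
 *     __m128i tag = gfmul_get_tag128(&gstate, ek0); // ek0 being AES(key, J0)
 */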

struct ptls_fusion_gfmul_state256 {
    __m256i hi, lo, mid;
};

static inline void gfmul_do_step256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
                                    union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    __m256i t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x00);
    gstate->lo = _mm256_xor_si256(gstate->lo, t);
    t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x11);
    gstate->hi = _mm256_xor_si256(gstate->hi, t);
    t = _mm256_shuffle_epi32(X, 78);
    t = _mm256_xor_si256(t, X);
    t = _mm256_clmulepi64_epi128(precompute->rx2, t, 0x00);
    gstate->mid = _mm256_xor_si256(gstate->mid, t);
}

static inline void gfmul_firststep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, int half,
                                      union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    X = _mm256_shuffle_epi8(X, byteswap256);
    X = _mm256_xor_si256(gstate->lo, X);
    if (half)
        X = _mm256_permute2f128_si256(X, X, 0x08);
    gstate->lo = _mm256_setzero_si256();
    gstate->hi = _mm256_setzero_si256();
    gstate->mid = _mm256_setzero_si256();
    gfmul_do_step256(gstate, X, precompute);
}

static inline void gfmul_nextstep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
                                     union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
{
    X = _mm256_shuffle_epi8(X, byteswap256);
    gfmul_do_step256(gstate, X, precompute);
}

static inline void gfmul_reduce256(struct ptls_fusion_gfmul_state256 *gstate)
{
#define XOR_256TO128(y) _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extractf128_si256((y), 1))
    __m128i hi = XOR_256TO128(gstate->hi);
    __m128i lo = XOR_256TO128(gstate->lo);
    __m128i mid = XOR_256TO128(gstate->mid);
#undef XOR_256TO128

    lo = gfmul_do_reduce(hi, lo, mid);
    gstate->lo = _mm256_castsi128_si256(lo);
}

static inline __m128i gfmul_get_tag256(struct ptls_fusion_gfmul_state256 *gstate, __m128i ek0)
{
    __m128i tag = _mm_shuffle_epi8(_mm256_castsi256_si128(gstate->lo), byteswap128);
    tag = _mm_xor_si128(tag, ek0);
    return tag;
}
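
/* The 256-bit state machine mirrors the 128-bit one but consumes two blocks per step via VPCLMULQDQ; `half` in
 * gfmul_firststep256 moves a lone leading block into the upper lane, and gfmul_reduce256 XORs the two lanes together
 * before running the shared 128-bit reduction. */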

static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v)
{
#define ROUNDKEY(i) (ctx->aesni256 ? _mm256_castsi256_si128(ctx->keys.m256[i]) : ctx->keys.m128[i])

    v = _mm_xor_si128(v, ROUNDKEY(0));
    for (size_t i = 1; i < ctx->rounds; ++i)
        v = _mm_aesenc_si128(v, ROUNDKEY(i));
    v = _mm_aesenclast_si128(v, ROUNDKEY(ctx->rounds));

    return v;

#undef ROUNDKEY
}

// 32 bytes of 0xff followed by 31 bytes of 0x00
static const uint8_t loadn_mask[63] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static const uint8_t loadn_shuffle[31] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                          0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                          0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // latter 15 bytes map to zero

NO_SANITIZE_ADDRESS
static inline __m128i loadn_end_of_page(const void *p, size_t l)
{
    uintptr_t shift = (uintptr_t)p & 15;
    __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift));
    return _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern);
}
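
/* Example: reading 3 bytes located 3 bytes before the end of a page cannot use a 16-byte unaligned load, as that would
 * touch the next page. Instead, loadn_end_of_page issues an aligned 16-byte load ending at (or before) the page
 * boundary, then uses PSHUFB to rotate the wanted bytes down to offset zero; indices of 0x80 in the pattern produce
 * zero bytes. loadn128/loadn256 below take this slow path only when the full-width load could actually cross into the
 * next page without the requested bytes doing so. */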

NO_SANITIZE_ADDRESS
static inline __m128i loadn128(const void *p, size_t l)
{
    __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 32 - l));
    uintptr_t mod4k = (uintptr_t)p % 4096;

    if (PTLS_LIKELY(mod4k <= 4096 - 16) || mod4k + l > 4096) {
        v = _mm_loadu_si128(p);
    } else {
        v = loadn_end_of_page(p, l);
    }
    v = _mm_and_si128(v, mask);

    return v;
}

NO_SANITIZE_ADDRESS
static inline __m256i loadn256(const void *p, size_t l)
{
    __m256i v, mask = _mm256_loadu_si256((__m256i *)(loadn_mask + 32 - l));
    uintptr_t mod4k = (uintptr_t)p % 4096;

    if (PTLS_LIKELY(mod4k < 4096 - 32) || mod4k + l > 4096) {
        v = _mm256_loadu_si256(p);
    } else if (l > 16) {
        __m128i first16 = _mm_loadu_si128(p), second16 = loadn128((uint8_t *)p + 16, l - 16);
        v = _mm256_permute2f128_si256(_mm256_castsi128_si256(first16), _mm256_castsi128_si256(second16), 0x20);
    } else if (l == 16) {
        v = _mm256_castsi128_si256(_mm_loadu_si128(p));
    } else {
        v = _mm256_castsi128_si256(loadn_end_of_page(p, l));
    }
    v = _mm256_and_si256(v, mask);

    return v;
}

static inline void storen128(void *_p, size_t l, __m128i v)
{
    uint8_t buf[16], *p = _p;

    *(__m128i *)buf = v;

    for (size_t i = 0; i != l; ++i)
        p[i] = buf[i];
}
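
/* A minimal usage sketch of the one-shot AEAD API below (illustrative; `key`, `iv96`, and the buffer sizes are
 * assumptions of this example, and `out` must have room for inlen + 16 bytes of ciphertext plus tag):
 *
 *     ptls_fusion_aesgcm_context_t *gcm = ptls_fusion_aesgcm_new(key, PTLS_AES128_KEY_SIZE, aadlen + inlen);
 *     __m128i ctr = _mm_shuffle_epi8(loadn128(iv96, 12), byteswap128); // static IV in the upper 96 bits
 *     ptls_fusion_aesgcm_encrypt(gcm, out, in, inlen, ctr, aad, aadlen, NULL);
 *     ptls_fusion_aesgcm_free(gcm);
 *
 * Sequence numbers are folded into `ctr` by the caller; see calc_counter near the bottom of this file. */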

void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                                const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
        if (PTLS_LIKELY(srclen > 16 * 5)) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
        } else { \
            if ((state & STATE_EK0_BEEN_FED) == 0) { \
                bits5 = ek0; \
                state |= STATE_EK0_BEEN_FED; \
            } \
            if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \
                bits4 = _mm_loadu_si128(supp->input); \
                bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; \
                state |= STATE_SUPP_IN_PROCESS; \
            } \
        } \
        __m128i k = ctx->super.ecb.keys.m128[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, bits4keys[0]); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)

    struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
    __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    const __m128i *bits4keys = ctx->super.ecb.keys.m128; /* is changed to supp->ctx.keys when calculating suppout */
    struct ptls_fusion_gfmul_state128 gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);

    // src and dst are updated after the chunk is processed
    const __m128i *src = input;
    __m128i *dst = output;
    size_t srclen = inlen;
    // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed into the processor)
    const __m128i *aad = _aad, *dst_ghash = dst;
    size_t dst_ghashlen = srclen;

    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1;

#define STATE_EK0_BEEN_FED 0x3
#define STATE_EK0_INCOMPLETE 0x2
#define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1)
#define STATE_SUPP_USED 0x4
#define STATE_SUPP_IN_PROCESS 0x8
    int32_t state = supp != NULL ? STATE_SUPP_USED : 0;

    /* build counter */
    ctr = _mm_insert_epi32(ctr, 1, 0);
    ek0 = _mm_shuffle_epi8(ctr, byteswap128);

    /* start preparing AES */
    AESECB6_INIT();
    AESECB6_UPDATE(1);

    /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */
    const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH
    size_t gdata_cnt = 0;
    if (PTLS_LIKELY(aadlen != 0)) {
        while (gdata_cnt < 6) {
            if (PTLS_LIKELY(aadlen < 16)) {
                if (aadlen != 0) {
                    gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                    aadlen = 0;
                }
                goto MainLoop;
            }
            gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
            aadlen -= 16;
        }
    }

    /* the main loop */
MainLoop:
    while (1) {
        /* run AES and multiplication in parallel */
        size_t i;
        for (i = 2; i < gdata_cnt + 2; ++i) {
            AESECB6_UPDATE(i);
            gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
        }
        for (; i < ctx->super.ecb.rounds; ++i)
            AESECB6_UPDATE(i);
        AESECB6_FINAL(i);

        /* apply the bit stream to src and write to dest */
        if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src += 6;
            srclen -= 6 * 16;
        } else {
            if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) {
                ek0 = bits5;
                state &= ~STATE_EK0_INCOMPLETE;
            }
            if ((state & STATE_SUPP_IN_PROCESS) != 0) {
                _mm_storeu_si128((__m128i *)supp->output, bits4);
                state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS);
            }
            if (srclen != 0) {
#define APPLY(i) \
    do { \
        if (PTLS_LIKELY(srclen >= 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \
            srclen -= 16; \
        } else if (PTLS_LIKELY(srclen != 0)) { \
            bits0 = bits##i; \
            goto ApplyRemainder; \
        } else { \
            goto ApplyEnd; \
        } \
    } while (0)
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                goto ApplyEnd;
            ApplyRemainder:
                storen128(dst, srclen, _mm_xor_si128(loadn128(src, srclen), bits0));
                dst = (__m128i *)((uint8_t *)dst + srclen);
                srclen = 0;
            ApplyEnd:;
            }
        }

        /* next block AES starts here */
        AESECB6_INIT();

        AESECB6_UPDATE(1);

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                        aadlen = 0;
                    }
                    goto GdataFillDST;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
            }
            gdata = gdatabuf;
        } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) {
            gdata = dst_ghash;
            gdata_cnt = 6;
            dst_ghash += 6;
            dst_ghashlen -= 96;
        } else {
            gdata_cnt = 0;
        GdataFillDST:
            while (gdata_cnt < 6) {
                if (dst_ghashlen < 16) {
                    if (dst_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(dst_ghash, dst_ghashlen);
                        dst_ghashlen = 0;
                    }
                    if (gdata_cnt < 6)
                        goto Finish;
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++);
                dst_ghashlen -= 16;
            }
            gdata = gdatabuf;
        }
    }

Finish:
    gdatabuf[gdata_cnt++] = ac;

    /* We have a complete set of data to be fed into GHASH. Let's finish the remaining calculation.
     * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM
     * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead.
     */
    assert(STATE_EK0_READY());
    for (size_t i = 0; i < gdata_cnt; ++i)
        gfmul_nextstep128(&gstate, gdatabuf[i], --ghash_precompute);

    gfmul_reduce128(&gstate);
    _mm_storeu_si128(dst, gfmul_get_tag128(&gstate, ek0));

    /* Finish the calculation of supplemental vector. Done at the very last, because the sample might cover the GCM tag. */
    if ((state & STATE_SUPP_USED) != 0) {
        size_t i;
        if ((state & STATE_SUPP_IN_PROCESS) == 0) {
            bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128;
            bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]);
            i = 1;
        } else {
            i = 2;
        }
        do {
            bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]);
        } while (i != ctx->super.ecb.rounds);
        bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]);
        _mm_storeu_si128((__m128i *)supp->output, bits4);
    }

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_BEEN_FED
#undef STATE_EK0_READY
#undef STATE_SUPP_IN_PROCESS
}

int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                               const void *_aad, size_t aadlen, const void *tag)
{
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
    __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(),
            bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};
    __m128i gdatabuf[6];
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1;

    const __m128i *gdata; // points to the elements fed into GHASH
    size_t gdata_cnt;

    const __m128i *src_ghash = input, *src_aes = input, *aad = _aad;
    __m128i *dst = output;
    size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen;

    /* schedule ek0 and suppkey */
    ctr = _mm_add_epi64(ctr, one8);
    bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), ctx->super.ecb.keys.m128[0]);
    ++nondata_aes_cnt;

#define STATE_IS_FIRST_RUN 0x1
#define STATE_GHASH_HAS_MORE 0x2
    int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE;

    /* the main loop */
    while (1) {

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            gdata = gdatabuf;
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                        aadlen = 0;
                        ++nondata_aes_cnt;
                    }
                    goto GdataFillSrc;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
                ++nondata_aes_cnt;
            }
        } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) {
            gdata = src_ghash;
            gdata_cnt = 6;
            src_ghash += 6;
            src_ghashlen -= 6 * 16;
        } else {
            gdata = gdatabuf;
            gdata_cnt = 0;
        GdataFillSrc:
            while (gdata_cnt < 6) {
                if (src_ghashlen < 16) {
                    if (src_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(src_ghash, src_ghashlen);
                        src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen);
                        src_ghashlen = 0;
                    }
                    if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) {
                        gdatabuf[gdata_cnt++] = ac;
                        state &= ~STATE_GHASH_HAS_MORE;
                    }
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++);
                src_ghashlen -= 16;
            }
        }

        /* setup aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0))
            goto InitAllBits;
        switch (nondata_aes_cnt) {
#define INIT_BITS(n, keys) \
    case n: \
        ctr = _mm_add_epi64(ctr, one8); \
        bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), keys.m128[0]);
        InitAllBits:
            INIT_BITS(0, ctx->super.ecb.keys);
            INIT_BITS(1, ctx->super.ecb.keys);
            INIT_BITS(2, ctx->super.ecb.keys);
            INIT_BITS(3, ctx->super.ecb.keys);
            INIT_BITS(4, ctx->super.ecb.keys);
            INIT_BITS(5, ctx->super.ecb.keys);
#undef INIT_BITS
        }

        { /* run aes and ghash */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

            size_t aesi;
            for (aesi = 1; aesi <= gdata_cnt; ++aesi) {
                AESECB6_UPDATE(aesi);
                gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
            }
            for (; aesi < ctx->super.ecb.rounds; ++aesi)
                AESECB6_UPDATE(aesi);
            __m128i k = ctx->super.ecb.keys.m128[aesi];
            bits0 = _mm_aesenclast_si128(bits0, k);
            bits1 = _mm_aesenclast_si128(bits1, k);
            bits2 = _mm_aesenclast_si128(bits2, k);
            bits3 = _mm_aesenclast_si128(bits3, k);
            bits4 = _mm_aesenclast_si128(bits4, k);
            bits5 = _mm_aesenclast_si128(bits5, k);

#undef AESECB6_UPDATE
        }

        /* apply aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src_aes += 6;
            src_aeslen -= 6 * 16;
        } else {
            if ((state & STATE_IS_FIRST_RUN) != 0) {
                ek0 = bits0;
                state &= ~STATE_IS_FIRST_RUN;
            }
            switch (nondata_aes_cnt) {
#define APPLY(i) \
    case i: \
        if (PTLS_LIKELY(src_aeslen > 16)) { \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \
            src_aeslen -= 16; \
        } else { \
            bits0 = bits##i; \
            goto Finish; \
        }
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
            }
            nondata_aes_cnt = 0;
        }
    }

Finish:
    if (src_aeslen == 16) {
        _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0));
    } else if (src_aeslen != 0) {
        storen128(dst, src_aeslen, _mm_xor_si128(loadn128(src_aes, src_aeslen), bits0));
    }

    assert((state & STATE_IS_FIRST_RUN) == 0);

    /* the only case where the AES operation is complete but GHASH is not is when the application of AC remains */
    if ((state & STATE_GHASH_HAS_MORE) != 0) {
        assert(ghash_precompute - 1 == ctx->ghash);
        gfmul_nextstep128(&gstate, ac, --ghash_precompute);
    }

    gfmul_reduce128(&gstate);
    __m128i calctag = gfmul_get_tag128(&gstate, ek0);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff;

#undef STATE_IS_FIRST_RUN
#undef STATE_GHASH_HAS_MORE
}

static __m128i expand_key(__m128i key, __m128i temp)
{
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    key = _mm_xor_si128(key, temp);

    return key;
}

void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size, int aesni256)
{
    assert(is_enc && "decryption is not supported (yet)");

    size_t i = 0;

    switch (key_size) {
    case 16: /* AES128 */
        ctx->rounds = 10;
        break;
    case 32: /* AES256 */
        ctx->rounds = 14;
        break;
    default:
        assert(!"invalid key size; AES128 / AES256 are supported");
        break;
    }
    ctx->aesni256 = aesni256;

    /* load and expand keys using keys.m128 */
    ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key);
    if (key_size == 32)
        ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key + 1);
    while (1) {
#define EXPAND(R) \
    { \
        ctx->keys.m128[i] = \
            expand_key(ctx->keys.m128[i - key_size / 16], \
                       _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \
        if (i == ctx->rounds) \
            break; \
        ++i; \
        if (key_size > 24) { \
            ctx->keys.m128[i] = \
                expand_key(ctx->keys.m128[i - key_size / 16], \
                           _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \
            ++i; \
        } \
    }
        EXPAND(0x1);
        EXPAND(0x2);
        EXPAND(0x4);
        EXPAND(0x8);
        EXPAND(0x10);
        EXPAND(0x20);
        EXPAND(0x40);
        EXPAND(0x80);
        EXPAND(0x1b);
        EXPAND(0x36);
#undef EXPAND
    }

    /* convert to keys.m256 if aesni256 is used */
    if (ctx->aesni256) {
        size_t i = ctx->rounds;
        do {
            ctx->keys.m256[i] = _mm256_broadcastsi128_si256(ctx->keys.m128[i]);
        } while (i-- != 0);
    }
}

void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx)
{
    ptls_clear_memory(ctx, sizeof(*ctx));
}

void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src)
{
    __m128i v = _mm_loadu_si128(src);
    v = aesecb_encrypt(ctx, v);
    _mm_storeu_si128(dst, v);
}

/**
 * returns the number of ghash entries required to handle an AEAD block of the given size
 */
static size_t aesgcm_calc_ghash_cnt(size_t capacity)
{
    // round up by block size, add one to handle the worst split of the size between AAD and payload, plus one to hash AC
    return (capacity + 15) / 16 + 2;
}
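
/* For example, the 1500-byte capacity assumed by aesgcm_setup below yields (1500 + 15) / 16 + 2 = 96 entries; the
 * AVX256 code path additionally rounds the count up to an even number (see calc_aesgcm_context_size). */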

static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx)
{
    __m128i *H, *r, *Hprev, H0;

    if (ctx->ecb.aesni256) {
        struct ptls_fusion_aesgcm_context256 *ctx256 = (void *)ctx;
#define GET_SLOT(i, mem) (&ctx256->ghash[(i) / 2].mem[(i) % 2 == 0])
        H = GET_SLOT(ctx->ghash_cnt, H);
        r = GET_SLOT(ctx->ghash_cnt, r);
        Hprev = ctx->ghash_cnt == 0 ? NULL : GET_SLOT(ctx->ghash_cnt - 1, H);
#undef GET_SLOT
        H0 = ctx256->ghash[0].H[1];
    } else {
        struct ptls_fusion_aesgcm_context128 *ctx128 = (void *)ctx;
        H = &ctx128->ghash[ctx->ghash_cnt].H;
        r = &ctx128->ghash[ctx->ghash_cnt].r;
        Hprev = ctx->ghash_cnt == 0 ? NULL : &ctx128->ghash[ctx->ghash_cnt - 1].H;
        H0 = ctx128->ghash[0].H;
    }

    if (Hprev != NULL)
        *H = gfmul(*Hprev, H0);

    *r = _mm_shuffle_epi32(*H, 78);
    *r = _mm_xor_si128(*r, *H);

    ++ctx->ghash_cnt;
}

static size_t calc_aesgcm_context_size(size_t *ghash_cnt, int aesni256)
{
    size_t sz;

    if (aesni256) {
        if (*ghash_cnt % 2 != 0)
            ++*ghash_cnt;
        sz = offsetof(struct ptls_fusion_aesgcm_context256, ghash) +
             sizeof(union ptls_fusion_aesgcm_ghash_precompute256) * *ghash_cnt / 2;
    } else {
        sz = offsetof(struct ptls_fusion_aesgcm_context128, ghash) +
             sizeof(struct ptls_fusion_aesgcm_ghash_precompute128) * *ghash_cnt;
    }
    return sz;
}

static ptls_fusion_aesgcm_context_t *new_aesgcm(const void *key, size_t key_size, size_t capacity, int aesni256)
{
    ptls_fusion_aesgcm_context_t *ctx;
    size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity), ctx_size = calc_aesgcm_context_size(&ghash_cnt, aesni256);

    if ((ctx = aligned_alloc(32, ctx_size)) == NULL)
        return NULL;

    ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size, aesni256);

    ctx->capacity = capacity;

    __m128i H0 = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128());
    H0 = _mm_shuffle_epi8(H0, byteswap128);
    H0 = transformH(H0);
    if (ctx->ecb.aesni256) {
        ((struct ptls_fusion_aesgcm_context256 *)ctx)->ghash[0].H[1] = H0;
    } else {
        ((struct ptls_fusion_aesgcm_context128 *)ctx)->ghash[0].H = H0;
    }

    ctx->ghash_cnt = 0;
    while (ctx->ghash_cnt < ghash_cnt)
        setup_one_ghash_entry(ctx);

    return ctx;
}

ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity)
{
    return new_aesgcm(key, key_size, capacity, 0);
}
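
/* The precompute table is grow-only: when a larger record shows up, callers replace the context with the one returned
 * by ptls_fusion_aesgcm_set_capacity, as aead_do_encrypt / aead_do_decrypt below do:
 *
 *     if (inlen + aadlen > ctx->aesgcm->capacity)
 *         ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen);
 */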
1016
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001017ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity)
1018{
Kazuho Oku75e71f52022-09-15 09:43:03 +09001019 size_t new_ghash_cnt = aesgcm_calc_ghash_cnt(capacity);
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001020
Kazuho Oku75e71f52022-09-15 09:43:03 +09001021 if (new_ghash_cnt <= ctx->ghash_cnt)
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001022 return ctx;
1023
Kazuho Oku998f2e02022-09-15 13:10:26 +09001024 size_t new_ctx_size = calc_aesgcm_context_size(&new_ghash_cnt, ctx->ecb.aesni256),
1025 old_ctx_size = calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256);
1026
Kazuho Oku688d70c2022-05-11 13:46:22 +09001027 ptls_fusion_aesgcm_context_t *newp;
Kazuho Oku75e71f52022-09-15 09:43:03 +09001028 if ((newp = aligned_alloc(32, new_ctx_size)) == NULL)
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001029 return NULL;
Kazuho Oku998f2e02022-09-15 13:10:26 +09001030 memcpy(newp, ctx, old_ctx_size);
1031 ptls_clear_memory(ctx, old_ctx_size);
Kazuho Okuba56a5d2022-12-12 09:24:11 +09001032 aligned_free(ctx);
Kazuho Oku688d70c2022-05-11 13:46:22 +09001033 ctx = newp;
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001034
1035 ctx->capacity = capacity;
Kazuho Oku75e71f52022-09-15 09:43:03 +09001036 while (ctx->ghash_cnt < new_ghash_cnt)
Kazuho Oku7fd7c842020-05-18 14:04:42 +09001037 setup_one_ghash_entry(ctx);
Kazuho Okuf198c1b2020-05-08 00:45:29 +09001038
1039 return ctx;
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001040}
1041
Kazuho Oku31ebd7d2020-05-15 06:37:23 +09001042void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx)
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001043{
Kazuho Oku65d3e792022-06-29 12:04:02 +09001044 ptls_clear_memory(ctx, calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256));
Kazuho Oku07f37c22022-05-10 09:45:42 +09001045 /* skip ptls_fusion_aesecb_dispose, based on the knowledge that it does not allocate memory elsewhere */
1046
Kazuho Okuba56a5d2022-12-12 09:24:11 +09001047 aligned_free(ctx);
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001048}
1049
Kazuho Oku9f2fb302020-05-11 13:13:26 +09001050static void ctr_dispose(ptls_cipher_context_t *_ctx)
1051{
1052 struct ctr_context *ctx = (struct ctr_context *)_ctx;
1053 ptls_fusion_aesecb_dispose(&ctx->fusion);
1054 _mm_storeu_si128(&ctx->bits, _mm_setzero_si128());
1055}
1056
1057static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv)
1058{
1059 struct ctr_context *ctx = (struct ctr_context *)_ctx;
1060 _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv)));
1061 ctx->is_ready = 1;
1062}
1063
1064static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len)
1065{
1066 struct ctr_context *ctx = (struct ctr_context *)_ctx;
1067
1068 assert((ctx->is_ready && len <= 16) ||
1069 !"CTR transfomation is supported only once per call to `init` and the maximum size is limited to 16 bytes");
1070 ctx->is_ready = 0;
1071
1072 if (len < 16) {
Kazuho Oku680ce182022-05-09 15:53:18 +09001073 storen128(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn128(input, len)));
Kazuho Oku9f2fb302020-05-11 13:13:26 +09001074 } else {
1075 _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input)));
1076 }
1077}
1078
Kazuho Oku6b849782020-05-15 11:44:21 +09001079static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size)
Kazuho Oku9f2fb302020-05-11 13:13:26 +09001080{
1081 struct ctr_context *ctx = (struct ctr_context *)_ctx;
1082
1083 ctx->super.do_dispose = ctr_dispose;
1084 ctx->super.do_init = ctr_init;
1085 ctx->super.do_transform = ctr_transform;
Kazuho Oku65d3e792022-06-29 12:04:02 +09001086 ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size, 0 /* probably we do not need aesni256 for CTR? */);
Kazuho Oku9f2fb302020-05-11 13:13:26 +09001087 ctx->is_ready = 0;
1088
1089 return 0;
1090}
1091
Kazuho Oku6b849782020-05-15 11:44:21 +09001092static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
1093{
1094 return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE);
1095}
1096
1097static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
1098{
1099 return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE);
1100}
1101
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001102static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx)
1103{
1104 struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
1105
Kazuho Oku31ebd7d2020-05-15 06:37:23 +09001106 ptls_fusion_aesgcm_free(ctx->aesgcm);
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001107}
1108
Kazuho Okuea42ef72022-05-02 07:40:01 +09001109static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen)
1110{
1111 assert(!"FIXME");
1112}
1113
1114static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen)
1115{
1116 assert(!"FIXME");
1117 return SIZE_MAX;
1118}
1119
1120static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output)
1121{
1122 assert(!"FIXME");
1123 return SIZE_MAX;
1124}
1125
Kazuho Okuba2b9602020-05-14 08:21:39 +09001126static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq)
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001127{
Kazuho Okuba2b9602020-05-14 08:21:39 +09001128 __m128i ctr = _mm_setzero_si128();
1129 ctr = _mm_insert_epi64(ctr, seq, 0);
Kazuho Oku076982f2020-05-14 09:28:44 +09001130 ctr = _mm_slli_si128(ctr, 4);
Kazuho Okuba2b9602020-05-14 08:21:39 +09001131 ctr = _mm_xor_si128(ctx->static_iv, ctr);
1132 return ctr;
Kazuho Oku94feca22020-05-11 16:34:44 +09001133}
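
/* calc_counter implements the TLS 1.3 nonce construction (RFC 8446, Section 5.3): the 64-bit sequence number,
 * left-padded to 96 bits, is XORed into the static IV. Since `static_iv` keeps the IV byteswapped into the upper 96
 * bits of the register, shifting `seq` left by 4 bytes lines it up with the low-order end of the IV while leaving the
 * bottom 32 bits free for the GCM block counter. */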
1134
Kazuho Okuba2b9602020-05-14 08:21:39 +09001135void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
Kazuho Okuea42ef72022-05-02 07:40:01 +09001136 const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
Kazuho Okuba2b9602020-05-14 08:21:39 +09001137{
1138 struct aesgcm_context *ctx = (void *)_ctx;
1139
Kazuho Okuea42ef72022-05-02 07:40:01 +09001140 if (inlen + aadlen > ctx->aesgcm->capacity)
1141 ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen);
1142 ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp);
Kazuho Oku3a50ee12022-04-27 16:20:16 +09001143}
1144
1145static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq,
Kazuho Okuea42ef72022-05-02 07:40:01 +09001146 const void *aad, size_t aadlen)
Kazuho Oku3a50ee12022-04-27 16:20:16 +09001147{
1148 assert(!"FIXME");
Kazuho Okuba2b9602020-05-14 08:21:39 +09001149}
1150
1151static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
Kazuho Okuea42ef72022-05-02 07:40:01 +09001152 const void *aad, size_t aadlen)
Kazuho Oku94feca22020-05-11 16:34:44 +09001153{
Kazuho Okuba2b9602020-05-14 08:21:39 +09001154 struct aesgcm_context *ctx = (void *)_ctx;
1155
1156 if (inlen < 16)
1157 return SIZE_MAX;
Kazuho Oku94feca22020-05-11 16:34:44 +09001158
1159 size_t enclen = inlen - 16;
Kazuho Okuea42ef72022-05-02 07:40:01 +09001160 if (enclen + aadlen > ctx->aesgcm->capacity)
1161 ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen);
1162 if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen,
Kazuho Okuba2b9602020-05-14 08:21:39 +09001163 (const uint8_t *)input + enclen))
Kazuho Oku94feca22020-05-11 16:34:44 +09001164 return SIZE_MAX;
1165 return enclen;
Kazuho Oku32f6c7b2020-05-05 22:14:41 +09001166}

static inline void aesgcm_get_iv(ptls_aead_context_t *_ctx, void *iv)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    __m128i m128 = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    storen128(iv, PTLS_AESGCM_IV_SIZE, m128);
}

static inline void aesgcm_set_iv(ptls_aead_context_t *_ctx, const void *iv)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
}

static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;

    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    if (key == NULL)
        return 0;

    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_get_iv = aesgcm_get_iv;
    ctx->super.do_set_iv = aesgcm_set_iv;
    ctx->super.do_encrypt_init = aead_do_encrypt_init;
    ctx->super.do_encrypt_update = aead_do_encrypt_update;
    ctx->super.do_encrypt_final = aead_do_encrypt_final;
    ctx->super.do_encrypt = aead_do_encrypt;
    ctx->super.do_encrypt_v = aead_do_encrypt_v;
    ctx->super.do_decrypt = aead_do_decrypt;

    ctx->aesgcm = new_aesgcm(key, key_size, 1500 /* assume ordinary packet size */, 0 /* no support for aesni256 yet */);

    return 0;
}

static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}

static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}

int ptls_fusion_can_aesni256 = 0;
ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR",
                                                 PTLS_AES128_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes128ctr_setup};
ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR",
                                                 PTLS_AES256_KEY_SIZE,
                                                 1, // block size
                                                 PTLS_AES_IV_SIZE,
                                                 sizeof(struct ctr_context),
                                                 aes256ctr_setup};
ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes128ctr,
                                               NULL, // &ptls_fusion_aes128ecb,
                                               PTLS_AES128_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               {0}, // while it may work, no reason to support TLS/1.2
                                               0,
                                               0,
                                               sizeof(struct aesgcm_context),
                                               aes128gcm_setup};
ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM",
                                               PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                               PTLS_AESGCM_INTEGRITY_LIMIT,
                                               &ptls_fusion_aes256ctr,
                                               NULL, // &ptls_fusion_aes256ecb,
                                               PTLS_AES256_KEY_SIZE,
                                               PTLS_AESGCM_IV_SIZE,
                                               PTLS_AESGCM_TAG_SIZE,
                                               {0}, // while it may work, no reason to support TLS/1.2
                                               0,
                                               0,
                                               sizeof(struct aesgcm_context),
                                               aes256gcm_setup};
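
/* Illustrative only: a minimal sketch of how these AEAD descriptors are meant to be driven through the generic picotls API. The
 * key/IV values are placeholders, and `ptls_aead_new_direct()` is assumed to be available (it is declared by recent versions of
 * picotls.h); this sketch is not part of the implementation above. */
#if 0
static void fusion_usage_example(void)
{
    static const uint8_t key[PTLS_AES128_KEY_SIZE] = {0}, iv[PTLS_AESGCM_IV_SIZE] = {0};
    uint8_t ciphertext[5 + PTLS_AESGCM_TAG_SIZE], decrypted[5];

    /* encrypt: output holds the ciphertext followed by the 16-byte tag */
    ptls_aead_context_t *enc = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 1 /* is_enc */, key, iv);
    size_t enclen = ptls_aead_encrypt(enc, ciphertext, "hello", 5, 0 /* seq */, "aad", 3);
    ptls_aead_free(enc);

    /* decrypt: returns SIZE_MAX if the tag does not authenticate */
    ptls_aead_context_t *dec = ptls_aead_new_direct(&ptls_fusion_aes128gcm, 0, key, iv);
    size_t declen = ptls_aead_decrypt(dec, decrypted, ciphertext, enclen, 0, "aad", 3);
    assert(declen == 5);
    ptls_aead_free(dec);
}
#endif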

static inline size_t calc_total_length(ptls_iovec_t *input, size_t incnt)
{
    size_t totlen = 0;
    for (size_t i = 0; i < incnt; ++i)
        totlen += input[i].len;
    return totlen;
}

static inline void reduce_aad128(struct ptls_fusion_gfmul_state128 *gstate, struct ptls_fusion_aesgcm_ghash_precompute128 *ghash,
                                 const void *_aad, size_t aadlen)
{
    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute;
    const uint8_t *aad = _aad;

    while (PTLS_UNLIKELY(aadlen >= 6 * 16)) {
        ghash_precompute = ghash + 6;
        gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
        aad += 16;
        aadlen -= 16;
        for (int i = 1; i < 6; ++i) {
            gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
            aad += 16;
            aadlen -= 16;
        }
        gfmul_reduce128(gstate);
    }

    if (PTLS_LIKELY(aadlen != 0)) {
        ghash_precompute = ghash + (aadlen + 15) / 16;
        if (PTLS_UNLIKELY(aadlen >= 16)) {
            gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
            aad += 16;
            aadlen -= 16;
            while (aadlen >= 16) {
                gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
                aad += 16;
                aadlen -= 16;
            }
            if (PTLS_LIKELY(aadlen != 0))
                gfmul_nextstep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
        } else {
            gfmul_firststep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
        }
        assert(ghash == ghash_precompute);
        gfmul_reduce128(gstate);
    }
}
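
/* For reference: over blocks X_1..X_n and hash key H, a batched GHASH evaluates
 *
 *     Y = X_1 * H^n  XOR  X_2 * H^(n-1)  XOR  ...  XOR  X_n * H
 *
 * (carry-less multiplication in GF(2^128)), which is why the code above walks the table of precomputed powers of H downwards
 * (`--ghash_precompute`) while walking the AAD forwards, deferring the modular reduction to one gfmul_reduce128() call per
 * batch of at most six blocks. */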

NO_SANITIZE_ADDRESS
static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output)
{
    uint8_t *encp;

    if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) {
        _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf))));
        _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32)));
        *output -= encp - encbuf;
    }

    return encp;
}
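
/* Worked example (values illustrative): if `*output` is 0x10027, i.e., at offset 39 within its 64-byte cache line, the function
 * copies the 64 bytes starting at 0x10000 into `encbuf`, rewinds `*output` to 0x10000, and returns `encbuf + 39`. Ciphertext is
 * then appended at the returned pointer, so the eventual aligned non-temporal flush rewrites the preceding 39 bytes with the
 * values they already contained. */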

NO_SANITIZE_ADDRESS
static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end)
{
    /* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line
     * would likely be read when the next TLS record is being built. */

    for (; end - src >= 64; dst += 64, src += 64) {
        _mm256_stream_si256((void *)dst, _mm256_load_si256((void *)src));
        _mm256_stream_si256((void *)(dst + 32), _mm256_load_si256((void *)(src + 32)));
    }
    _mm_sfence(); /* weakly ordered writes have to be synced before being passed to NIC */
    if (src != end) {
        for (; end - src >= 16; dst += 16, src += 16)
            _mm_store_si128((void *)dst, _mm_load_si128((void *)src));
        if (src != end)
            storen128((void *)dst, end - src, loadn128((void *)src, end - src));
    }
}
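
/* A note on the (inferred) contract: `dst` is 64-byte aligned at entry because load_preceding_unaligned() rewinds `output` to a
 * cache-line boundary, and `src` points into the 32-byte-aligned `encbuf`; that is what permits the aligned loads and stores
 * (including the non-temporal ones) used above. */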

NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *aad, size_t aadlen)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
        if (PTLS_LIKELY(srclen > 16 * 5) || src_vecleft != 0) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
        } else { \
            bits5 = ek0; \
            state |= STATE_EK0_READY; \
        } \
        __m128i k = ctx->super.ecb.keys.m128[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, k); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, k); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;

#define STATE_EK0_READY 0x1
#define STATE_COPY_128B 0x2
    int32_t state = 0;

    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time. */
    uint8_t encbuf[32 * 6] __attribute__((aligned(32))), *encp;

    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, six 16-byte AES blocks, plus the AEAD tag that
     * is appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16);
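    /* Worked out: 64 (worst-case unaligned prefix) + 6 * 16 (one keystream batch) + 16 (tag) = 176 bytes, while sizeof(encbuf)
     * is 32 * 6 = 192, so the assertion holds with room to spare. */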

    /* load unaligned data within the same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);

    /* The first write will be 128 bytes (32 + 6 * 16) if encbuf already contains at least 32 bytes. */
    if (encp - encbuf >= 32)
        state |= STATE_COPY_128B;

    /* setup ctr, retaining Ek(0) and len(A) | len(C) to be fed into GCM */
    __m128i ctr = calc_counter(agctx, seq);
    ctr = _mm_insert_epi32(ctr, 1, 0);
    __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128);
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128);
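    /* For reference (GCM with a 96-bit IV): the initial counter block J0 is IV || 0x00000001; EK0 = AES_K(J0) is the mask that
     * turns the GHASH output into the tag; and the first keystream block uses counter value 2 -- hence `ctr` is set to ...||1
     * above and AESECB6_INIT() increments it before every use. `ac` is the final GHASH block, holding the bit lengths of the
     * AAD and the ciphertext. */
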
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm;
    __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};

    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }

    /* Prepare the first 6 blocks of the bit stream, at the same time calculating the ghash of the AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    reduce_aad128(&gstate, ctx->ghash, aad, aadlen);
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

    /* Main loop. Each iteration:
     * 1. using the current keystream (bits0..bits5), XORs up to 6 * 16 bytes of input and writes them to encbuf,
     * 2. then, if there is no more data to be encrypted, exits the loop; otherwise,
     * 3. calculates the ghash of the blocks just written to encbuf,
     * 4. calculates the next 6 * 16 bytes of keystream,
     * 5. writes encbuf out in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` is the offset within `encbuf` from which ghash remains to be calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(src + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 16;
                src += 6 * 16;
                srclen -= 6 * 16;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        remaining_ghash_from = (encp - encbuf) - 96;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 16 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 16 && bytes_copied < 5 * 16) {
                        _mm_storeu_si128((void *)(encp + bytes_copied), _mm_loadu_si128((void *)src));
                        bytes_copied += 16;
                        src += 16;
                        srclen -= 16;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        do {
                            if (src_vecleft == 0)
                                break;
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        } while (srclen == 0);
                        if (srclen == 0)
                            break;
                    }
                } while (bytes_copied < 6 * 16);
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(encp + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, as well as zero-clearing the remainder of the partial
                     * block, as it will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }

            /* Next 96-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal stores in
             * 64-byte blocks. */
            AESECB6_INIT();
            struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encp - 6 * 16)), --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 5 * 16)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 4 * 16)), --ghash_precompute);
            AESECB6_UPDATE(3);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            AESECB6_UPDATE(4);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 3 * 16)), --ghash_precompute);
            AESECB6_UPDATE(5);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 2 * 16)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 1 * 16)), --ghash_precompute);
            AESECB6_UPDATE(7);
            if ((state & STATE_COPY_128B) != 0) {
                _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
                output += 128;
                encp -= 128;
                AESECB6_UPDATE(8);
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 128)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 160)));
            } else {
                output += 64;
                encp -= 64;
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 96)));
                AESECB6_UPDATE(8);
            }
            state ^= STATE_COPY_128B;
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce128(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }

    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */

    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * 7 blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), ac);
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 7);
        struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + blocks;
        gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
        remaining_ghash_from += 16;
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 16;
        }
        gfmul_reduce128(&gstate);
    }
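    /* Worked example: if 40 bytes remain unhashed when the loop exits, `ac_off` lands 48 bytes past `remaining_ghash_from`,
     * `blocks` is 40 / 16 rounded up plus one for `ac`, i.e. 4, and the loop above consumes the precomputed powers H^4..H^1. */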

    /* Calculate EK0 in the unlikely case that it has not been derived yet. When encrypting at full size (16K), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]);
        bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]);
    }

    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp, gfmul_get_tag128(&gstate, bits5));
    encp += 16;

    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_READY
#undef STATE_COPY_128B
}

static size_t non_temporal_decrypt128(ptls_aead_context_t *_ctx, void *_output, const void *_input, size_t inlen, uint64_t seq,
                                      const void *aad, size_t aadlen)
{
    /* Bail out if the input is too short; otherwise, exclude the tag from the range being decrypted. */
    if (inlen < 16)
        return SIZE_MAX;
    inlen -= 16;
    size_t textlen = inlen;

/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
        if (PTLS_LIKELY(inlen > 16 * 5)) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
        } else { \
            bits5 = ek0; \
            state |= STATE_EK0_READY; \
        } \
        __m128i k = ctx->super.ecb.keys.m128[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, k); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, k); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *input = _input;

#define STATE_EK0_READY 0x1
    int32_t state = 0;

    /* setup ctr, retaining Ek(0) and len(A) | len(C) to be fed into GCM */
    __m128i ctr = calc_counter(agctx, seq);
    ctr = _mm_insert_epi32(ctr, 1, 0);
    __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128);
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);

    struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm;
    __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};

    /* Prepare the first 6 blocks of the bit stream, at the same time calculating the ghash of the AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    reduce_aad128(&gstate, ctx->ghash, aad, aadlen);
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

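    /* Scheduling note: each iteration of the loop below consumes the keystream prepared in the previous iteration, GHASHes the
     * six ciphertext blocks it has just loaded, and interleaves the AES rounds that prepare the next six keystream blocks; the
     * index arithmetic `ctx->ghash + 5 - i` pairs ciphertext block i with the precomputed power H^(6-i). */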
    /* Main loop. Operate in full blocks (6 * 16 bytes). */
    while (PTLS_LIKELY(inlen >= 6 * 16)) {
#define DECRYPT(i) _mm_storeu_si128((void *)(output + i * 16), _mm_xor_si128(bits##i, _mm_loadu_si128((void *)(input + i * 16))))
        DECRYPT(0);
        DECRYPT(1);
        DECRYPT(2);
        DECRYPT(3);
        DECRYPT(4);
        DECRYPT(5);
#undef DECRYPT
#define GFMUL_NEXT(i) gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(input + i * 16)), ctx->ghash + 5 - i)
        AESECB6_INIT();
        AESECB6_UPDATE(1);
        AESECB6_UPDATE(2);
        AESECB6_UPDATE(3);
        gfmul_firststep128(&gstate, _mm_loadu_si128((void *)input), ctx->ghash + 5);
        AESECB6_UPDATE(4);
        GFMUL_NEXT(1);
        AESECB6_UPDATE(5);
        GFMUL_NEXT(2);
        AESECB6_UPDATE(6);
        GFMUL_NEXT(3);
        AESECB6_UPDATE(7);
        GFMUL_NEXT(4);
        AESECB6_UPDATE(8);
        GFMUL_NEXT(5);
        AESECB6_UPDATE(9);
        gfmul_reduce128(&gstate);
        if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
            size_t i = 10;
            do {
                AESECB6_UPDATE(i);
            } while (++i < ctx->super.ecb.rounds);
        }
        AESECB6_FINAL(ctx->super.ecb.rounds);
        output += 6 * 16;
        input += 6 * 16;
        inlen -= 6 * 16;
#undef GFMUL_NEXT
    }

    /* Decrypt the remainder and finish the GHASH calculation. */
    if (inlen != 0) {
        struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (inlen + 15) / 16 + 1;
#define ONEBLOCK(i) \
    do { \
        if (inlen != 0) { \
            __m128i b = inlen >= 16 ? _mm_loadu_si128((void *)input) : loadn128(input, inlen); \
            if (i == 0) { \
                gfmul_firststep128(&gstate, b, --ghash_precompute); \
            } else { \
                gfmul_nextstep128(&gstate, b, --ghash_precompute); \
            } \
            b = _mm_xor_si128(b, bits##i); \
            if (inlen >= 16) { \
                _mm_storeu_si128((void *)output, b); \
                output += 16; \
                input += 16; \
                inlen -= 16; \
            } else { \
                storen128(output, inlen, b); \
                output += inlen; \
                input += inlen; \
                inlen = 0; \
            } \
        } \
    } while (0)
        ONEBLOCK(0);
        ONEBLOCK(1);
        ONEBLOCK(2);
        ONEBLOCK(3);
        ONEBLOCK(4);
        ONEBLOCK(5);
#undef ONEBLOCK
        gfmul_nextstep128(&gstate, ac, --ghash_precompute);
        assert(ghash_precompute == ctx->ghash);
    } else {
        gfmul_firststep128(&gstate, ac, ctx->ghash);
    }
    gfmul_reduce128(&gstate);

    /* Calculate EK0 if not yet available in bits5. */
    if ((state & STATE_EK0_READY) == 0) {
        bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]);
        bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]);
    }

    /* Calculate GCM tag and compare. */
    __m128i calctag = gfmul_get_tag128(&gstate, bits5);
    __m128i recvtag = _mm_loadu_si128((void *)input);
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(calctag, recvtag)) != 0xffff)
        return SIZE_MAX;

    return textlen;

#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_READY
}

NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v256(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *_aad, size_t aadlen)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits0 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits1 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits2 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits3 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits4 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits5 = _mm256_shuffle_epi8(ctr, byteswap256); \
        if (PTLS_UNLIKELY(srclen <= 32 * 6 - 16) && src_vecleft == 0) { \
            bits5 = _mm256_permute2f128_si256(bits5, ac_ek0, 0x30); \
            state |= STATE_EK0_READY; \
        } \
        __m256i k = ctx->super.ecb.keys.m256[0]; \
        bits0 = _mm256_xor_si256(bits0, k); \
        bits1 = _mm256_xor_si256(bits1, k); \
        bits2 = _mm256_xor_si256(bits2, k); \
        bits3 = _mm256_xor_si256(bits3, k); \
        bits4 = _mm256_xor_si256(bits4, k); \
        bits5 = _mm256_xor_si256(bits5, k); \
    } while (0)

/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m256i k = ctx->super.ecb.keys.m256[i]; \
        bits0 = _mm256_aesenc_epi128(bits0, k); \
        bits1 = _mm256_aesenc_epi128(bits1, k); \
        bits2 = _mm256_aesenc_epi128(bits2, k); \
        bits3 = _mm256_aesenc_epi128(bits3, k); \
        bits4 = _mm256_aesenc_epi128(bits4, k); \
        bits5 = _mm256_aesenc_epi128(bits5, k); \
    } while (0)

/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m256i k = ctx->super.ecb.keys.m256[i]; \
        bits0 = _mm256_aesenclast_epi128(bits0, k); \
        bits1 = _mm256_aesenclast_epi128(bits1, k); \
        bits2 = _mm256_aesenclast_epi128(bits2, k); \
        bits3 = _mm256_aesenclast_epi128(bits3, k); \
        bits4 = _mm256_aesenclast_epi128(bits4, k); \
        bits5 = _mm256_aesenclast_epi128(bits5, k); \
    } while (0)

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *aad = _aad;

#define STATE_EK0_READY 0x1
    int32_t state = 0;

    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time. */
    uint8_t encbuf[32 * 9] __attribute__((aligned(32))), *encp;

    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, six 32-byte units of AES output, plus the AEAD
     * tag that is appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16);

    /* load unaligned data within the same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);

    /* setup ctr, retaining Ek(0) and len(A) | len(C) to be fed into GCM */
    __m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq));
    ctr = _mm256_insert_epi32(ctr, 1, 4);
    __m256i ac_ek0 = _mm256_permute2f128_si256(
        /* first half: ac */
        _mm256_castsi128_si256(
            _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128)),
        /* second half: ek0 */
        _mm256_shuffle_epi8(ctr, byteswap256), 0x30);

    struct ptls_fusion_aesgcm_context256 *ctx = (void *)agctx->aesgcm;
    __m256i bits0, bits1, bits2, bits3, bits4, bits5 = _mm256_setzero_si256();
    struct ptls_fusion_gfmul_state256 gstate = {0};

    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }

    /* Prepare the first 6 * 32 bytes of the bit stream, at the same time calculating the ghash of the AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    if (PTLS_LIKELY(aadlen != 0)) {
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute;
        while (PTLS_UNLIKELY(aadlen >= 6 * 32)) {
            ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
            aad += 32;
            aadlen -= 32;
            for (int i = 1; i < 6; ++i) {
                gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                aad += 32;
                aadlen -= 32;
            }
            gfmul_reduce256(&gstate);
        }
        if (PTLS_LIKELY(aadlen != 0)) {
            ghash_precompute = ctx->ghash + (aadlen + 31) / 32;
            if (PTLS_UNLIKELY(aadlen >= 32)) {
                if (aadlen % 32 == 0 || aadlen % 32 > 16) {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                } else {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 1, --ghash_precompute);
                    aad += 16;
                    aadlen -= 16;
                }
                while (aadlen >= 32) {
                    gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                }
                if (PTLS_LIKELY(aadlen != 0)) {
                    assert(aadlen > 16);
                    gfmul_nextstep256(&gstate, loadn256(aad, aadlen), --ghash_precompute);
                }
            } else {
                gfmul_firststep256(&gstate, loadn256(aad, aadlen), aadlen <= 16, --ghash_precompute);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
        }
    }
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

    /* Main loop. Each iteration:
     * 1. using the current keystream (bits0..bits5), XORs up to 6 * 32 bytes of input and writes them to encbuf,
     * 2. then, if there is no more data to be encrypted, exits the loop; otherwise,
     * 3. calculates the ghash of the blocks just written to encbuf,
     * 4. calculates the next 6 * 32 bytes of keystream,
     * 5. writes encbuf out in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` is the offset within `encbuf` from which ghash remains to be calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 32)) {
#define APPLY(i) _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(src + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 32;
                src += 6 * 32;
                srclen -= 6 * 32;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        remaining_ghash_from = (encp - encbuf) - 6 * 32;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 32 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 32 && bytes_copied < 5 * 32) {
                        _mm256_storeu_si256((void *)(encp + bytes_copied), _mm256_loadu_si256((void *)src));
                        bytes_copied += 32;
                        src += 32;
                        srclen -= 32;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        do {
                            if (src_vecleft == 0)
                                break;
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        } while (srclen == 0);
                        if (srclen == 0)
                            break;
                    }
                } while (bytes_copied < 6 * 32);
#define APPLY(i) \
    _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(encp + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, as well as zero-clearing the remainder of the partial
                     * block, as it will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }

            /* Next 192-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal store
             * instructions. */
            AESECB6_INIT();
            union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encp - 6 * 32)), 0, --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 5 * 32)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 4 * 32)), --ghash_precompute);
            AESECB6_UPDATE(3);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 3 * 32)), --ghash_precompute);
            AESECB6_UPDATE(4);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
            _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
            _mm256_stream_si256((void *)(output + 128), _mm256_load_si256((void *)(encbuf + 128)));
            _mm256_stream_si256((void *)(output + 160), _mm256_load_si256((void *)(encbuf + 160)));
            AESECB6_UPDATE(5);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 2 * 32)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 1 * 32)), --ghash_precompute);
            output += 192;
            encp -= 192;
            AESECB6_UPDATE(7);
            _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 192)));
            AESECB6_UPDATE(8);
            _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 224)));
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }

    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */

    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * up to 13 blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), _mm256_castsi256_si128(ac_ek0));
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 13);
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + blocks / 2;
        if (blocks % 2 != 0) {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 1, ghash_precompute);
            remaining_ghash_from += 16;
        } else {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 0, --ghash_precompute);
            remaining_ghash_from += 32;
        }
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 32;
        }
        gfmul_reduce256(&gstate);
    }

    /* Calculate EK0 in the unlikely case that it has not been derived yet. When encrypting at full size (16K), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = ac_ek0;
        bits5 = _mm256_xor_si256(bits5, ctx->super.ecb.keys.m256[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm256_aesenc_epi128(bits5, ctx->super.ecb.keys.m256[i]);
        bits5 = _mm256_aesenclast_epi128(bits5, ctx->super.ecb.keys.m256[ctx->super.ecb.rounds]);
    }

    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp,
                     gfmul_get_tag256(&gstate, _mm256_castsi256_si128(_mm256_permute2f128_si256(bits5, bits5, 0x11))));
    encp += 16;

    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);
}

static int non_temporal_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
    int aesni256 = is_enc && ptls_fusion_can_aesni256;

    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    if (key == NULL)
        return 0;

    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_get_iv = aesgcm_get_iv;
    ctx->super.do_set_iv = aesgcm_set_iv;
    ctx->super.do_encrypt_init = NULL;
    ctx->super.do_encrypt_update = NULL;
    ctx->super.do_encrypt_final = NULL;
    if (is_enc) {
        ctx->super.do_encrypt = ptls_aead__do_encrypt;
        ctx->super.do_encrypt_v = aesni256 ? non_temporal_encrypt_v256 : non_temporal_encrypt_v128;
        ctx->super.do_decrypt = NULL;
    } else {
        assert(!aesni256);
        ctx->super.do_encrypt = NULL;
        ctx->super.do_encrypt_v = NULL;
        ctx->super.do_decrypt = non_temporal_decrypt128;
    }

    ctx->aesgcm =
        new_aesgcm(key, key_size,
                   7 * (ptls_fusion_can_aesni256 ? 32 : 16), // 6 blocks at once, plus len(A) | len(C) that we might append
                   aesni256);

    return 0;
}

static int non_temporal_aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}

static int non_temporal_aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}

ptls_aead_algorithm_t ptls_non_temporal_aes128gcm = {"AES128-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes128ctr,
                                                     NULL, // &ptls_fusion_aes128ecb,
                                                     PTLS_AES128_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE},
                                                     1,
                                                     PTLS_X86_CACHE_LINE_ALIGN_BITS,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes128gcm_setup};
ptls_aead_algorithm_t ptls_non_temporal_aes256gcm = {"AES256-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes256ctr,
                                                     NULL, // &ptls_fusion_aes256ecb,
                                                     PTLS_AES256_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE},
                                                     1,
                                                     PTLS_X86_CACHE_LINE_ALIGN_BITS,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes256gcm_setup};
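
/* Illustrative only: a minimal sketch of driving the non-temporal encryptor with a gathered record. `ptls_aead_new_direct()` is
 * assumed to be available, and calling through `do_encrypt_v` directly mirrors what the generic API wrapper does; the key/iv
 * arguments and buffer sizes are placeholders. Note that the implementation deliberately reads, and rewrites with identical
 * values, the bytes sharing a cache line with `ciphertext`. */
#if 0
static void non_temporal_usage_example(const uint8_t *key, const uint8_t *iv)
{
    if (!ptls_fusion_is_supported_by_cpu())
        return;

    ptls_aead_context_t *enc = ptls_aead_new_direct(&ptls_non_temporal_aes128gcm, 1 /* is_enc */, key, iv);
    ptls_iovec_t vecs[2] = {ptls_iovec_init("hello, ", 7), ptls_iovec_init("world", 5)};
    uint8_t ciphertext[12 + PTLS_AESGCM_TAG_SIZE];
    enc->do_encrypt_v(enc, ciphertext, vecs, 2, 0 /* seq */, NULL, 0);
    ptls_aead_free(enc);
}
#endif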
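/* For reference, the CPUID feature bits tested below: leaf 1 ECX bit 1 is PCLMULQDQ and bit 25 is AES-NI; leaf 7 EBX bit 5 is
 * AVX2; leaf 7 ECX bits 9 and 10 (together, mask 0x600) are VAES and VPCLMULQDQ, the 256-bit vector forms required by the
 * aesni256 code paths. */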
#ifdef _WINDOWS
/**
 * ptls_fusion_is_supported_by_cpu:
 * Checks that the CPU has the extended instructions for PCLMUL, AES, and AVX2.
 * This test assumes that the CPU follows the x86/x64 architecture.
 * A slightly more refined test could check that the CPU vendor string spells
 * out "GenuineIntel" or "AuthenticAMD", but that would fail in the presence
 * of little-known CPU brands or some VMs. */
int ptls_fusion_is_supported_by_cpu(void)
{
    uint32_t cpu_info[4];
    uint32_t nb_ids;
    int is_supported = 0;

    __cpuid(cpu_info, 0);
    nb_ids = cpu_info[0];

    if (nb_ids >= 7) {
        uint32_t leaf1_ecx;
        __cpuid(cpu_info, 1);
        leaf1_ecx = cpu_info[2];

        if (/* PCLMUL */ (leaf1_ecx & (1 << 1)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) {
            uint32_t leaf7_ebx, leaf7_ecx;
            __cpuid(cpu_info, 7);
            leaf7_ebx = cpu_info[1];
            leaf7_ecx = cpu_info[2];

            is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0;

            /* enable 256-bit mode if possible */
            if (is_supported && (leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256)
                ptls_fusion_can_aesni256 = 1;
        }
    }

    return is_supported;
}
#else
int ptls_fusion_is_supported_by_cpu(void)
{
    unsigned leaf1_ecx, leaf7_ebx, leaf7_ecx;

    { /* GCC-specific code to obtain CPU features */
        unsigned leaf_cnt;
        __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx");
        if (leaf_cnt < 7)
            return 0;
        __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx");
        __asm__("cpuid" : "=b"(leaf7_ebx), "=c"(leaf7_ecx) : "a"(7), "c"(0) : "edx");
    }

    /* AVX2 */
    if ((leaf7_ebx & (1 << 5)) == 0)
        return 0;
    /* AES */
    if ((leaf1_ecx & (1 << 25)) == 0)
        return 0;
    /* PCLMUL */
    if ((leaf1_ecx & (1 << 1)) == 0)
        return 0;

    /* enable 256-bit mode if possible */
    if ((leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256)
        ptls_fusion_can_aesni256 = 1;

    return 1;
}
#endif