Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1 | /* |
| 2 | * This source file is licensed under the Apache License 2.0 *and* the MIT |
| 3 | * License. Please agree to *both* of the licensing terms! |
| 4 | * |
| 5 | * |
| 6 | * `transformH` function is a derivative work of OpenSSL. The original work |
| 7 | * is covered by the following license: |
| 8 | * |
| 9 | * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 10 | * |
| 11 | * Licensed under the Apache License 2.0 (the "License"). You may not use |
| 12 | * this file except in compliance with the License. You can obtain a copy |
| 13 | * in the file LICENSE in the source distribution or at |
| 14 | * https://www.openssl.org/source/license.html |
| 15 | * |
| 16 | * |
| 17 | * All other work, including modifications to the `transformH` function is |
| 18 | * covered by the following MIT license: |
| 19 | * |
Kazuho Oku | d1a0912 | 2022-05-09 16:06:36 +0900 | [diff] [blame] | 20 | * Copyright (c) 2020-2022 Fastly, Kazuho Oku |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 21 | * |
| 22 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 23 | * of this software and associated documentation files (the "Software"), to |
| 24 | * deal in the Software without restriction, including without limitation the |
| 25 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 26 | * sell copies of the Software, and to permit persons to whom the Software is |
| 27 | * furnished to do so, subject to the following conditions: |
| 28 | * |
| 29 | * The above copyright notice and this permission notice shall be included in |
| 30 | * all copies or substantial portions of the Software. |
| 31 | * |
| 32 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 33 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 34 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 35 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 36 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 37 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| 38 | * IN THE SOFTWARE. |
| 39 | */ |
| 40 | #include <stdint.h> |
Kazuho Oku | 14c00c0 | 2020-09-12 20:48:25 +0900 | [diff] [blame] | 41 | |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 42 | #include <stdlib.h> |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 43 | #include <string.h> |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 44 | #include <immintrin.h> |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 45 | #include <tmmintrin.h> |
Kazuho Oku | 02ca0f0 | 2020-05-13 20:46:39 +0900 | [diff] [blame] | 46 | #include <nmmintrin.h> |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 47 | #include <wmmintrin.h> |
| 48 | #include "picotls.h" |
| 49 | #include "picotls/fusion.h" |
| 50 | |
/*
 * NO_SANITIZE_ADDRESS expands to the compiler-specific attribute that disables AddressSanitizer instrumentation of a function
 * when the build is sanitized, and to nothing otherwise. It is always defined after this point.
 */
#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#else
#define NO_SANITIZE_ADDRESS
#endif
#elif __SANITIZE_ADDRESS__ /* gcc */
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#else
#define NO_SANITIZE_ADDRESS
#endif

/* On Windows, spell aligned_alloc(alignment, size) in terms of _aligned_malloc, which takes its arguments as (size, alignment). */
#if defined(_WINDOWS)
#define aligned_alloc(a, s) _aligned_malloc((s), (a))
#endif
| 65 | |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 66 | struct ptls_fusion_aesgcm_context { |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 67 | ptls_fusion_aesecb_context_t ecb; |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 68 | size_t capacity; |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 69 | size_t ghash_cnt; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 70 | }; |
| 71 | |
| 72 | struct ptls_fusion_aesgcm_context128 { |
| 73 | struct ptls_fusion_aesgcm_context super; |
| 74 | struct ptls_fusion_aesgcm_ghash_precompute128 { |
| 75 | __m128i H; |
| 76 | __m128i r; |
| 77 | } ghash[0]; |
| 78 | }; |
| 79 | |
| 80 | struct ptls_fusion_aesgcm_context256 { |
| 81 | struct ptls_fusion_aesgcm_context super; |
| 82 | union ptls_fusion_aesgcm_ghash_precompute256 { |
| 83 | struct { |
| 84 | __m128i H[2]; |
| 85 | __m128i r[2]; |
| 86 | }; |
| 87 | struct { |
| 88 | __m256i Hx2; |
| 89 | __m256i rx2; |
| 90 | }; |
| 91 | } ghash[0]; |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 92 | }; |
| 93 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 94 | struct ctr_context { |
| 95 | ptls_cipher_context_t super; |
| 96 | ptls_fusion_aesecb_context_t fusion; |
| 97 | __m128i bits; |
| 98 | uint8_t is_ready; |
| 99 | }; |
| 100 | |
| 101 | struct aesgcm_context { |
| 102 | ptls_aead_context_t super; |
| 103 | ptls_fusion_aesgcm_context_t *aesgcm; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 104 | /** |
| 105 | * retains the static IV in the upper 96 bits (in little endian) |
| 106 | */ |
| 107 | __m128i static_iv; |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 108 | }; |
| 109 | |
/* Constant used by transformH and by the reduction steps of gfmul / gfmul_do_reduce; the high 64 bits are 0xc2 << 56. */
static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
#define poly (*(__m128i *)poly_)

/* Shuffle pattern for _mm_shuffle_epi8 / _mm256_shuffle_epi8 that reverses the byte order within each 128-bit lane. */
static const uint8_t byteswap_[32] __attribute__((aligned(32))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                                                   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#define byteswap128 (*(__m128i *)byteswap_)
#define byteswap256 (*(__m256i *)byteswap_)

/* 128-bit value having 1 in the lowest byte; added to the (little-endian) counter to advance it by one. */
static const uint8_t one_[16] __attribute__((aligned(16))) = {1};
#define one8 (*(__m128i *)one_)

/* 256-bit value having 2 in the lowest byte of each 128-bit lane. */
static const uint8_t incr128x2_[32] __attribute__((aligned(32))) = {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2};
#define incr128x2 (*(__m256i *)incr128x2_)
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 120 | |
Kazuho Oku | ae95e4c | 2020-05-11 06:27:27 +0900 | [diff] [blame] | 121 | /* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl |
| 122 | * at commit 33388b4. */ |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 123 | static __m128i transformH(__m128i H) |
| 124 | { |
| 125 | // # <<1 twist |
| 126 | // pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword |
| 127 | __m128i t2 = _mm_shuffle_epi32(H, 0xff); |
| 128 | // movdqa $Hkey,$T1 |
| 129 | __m128i t1 = H; |
| 130 | // psllq \$1,$Hkey |
| 131 | H = _mm_slli_epi64(H, 1); |
| 132 | // pxor $T3,$T3 # |
| 133 | __m128i t3 = _mm_setzero_si128(); |
| 134 | // psrlq \$63,$T1 |
| 135 | t1 = _mm_srli_epi64(t1, 63); |
| 136 | // pcmpgtd $T2,$T3 # broadcast carry bit |
| 137 | t3 = _mm_cmplt_epi32(t2, t3); |
| 138 | // pslldq \$8,$T1 |
| 139 | t1 = _mm_slli_si128(t1, 8); |
| 140 | // por $T1,$Hkey # H<<=1 |
| 141 | H = _mm_or_si128(t1, H); |
| 142 | |
| 143 | // # magic reduction |
| 144 | // pand .L0x1c2_polynomial(%rip),$T3 |
| 145 | t3 = _mm_and_si128(t3, poly); |
| 146 | // pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial |
| 147 | H = _mm_xor_si128(t3, H); |
| 148 | |
| 149 | return H; |
| 150 | } |
| 151 | // end of Apache License code |
| 152 | |
| 153 | static __m128i gfmul(__m128i x, __m128i y) |
| 154 | { |
| 155 | __m128i lo = _mm_clmulepi64_si128(x, y, 0x00); |
| 156 | __m128i hi = _mm_clmulepi64_si128(x, y, 0x11); |
| 157 | |
| 158 | __m128i a = _mm_shuffle_epi32(x, 78); |
| 159 | __m128i b = _mm_shuffle_epi32(y, 78); |
| 160 | a = _mm_xor_si128(a, x); |
| 161 | b = _mm_xor_si128(b, y); |
| 162 | |
| 163 | a = _mm_clmulepi64_si128(a, b, 0x00); |
| 164 | a = _mm_xor_si128(a, lo); |
| 165 | a = _mm_xor_si128(a, hi); |
| 166 | |
| 167 | b = _mm_slli_si128(a, 8); |
| 168 | a = _mm_srli_si128(a, 8); |
| 169 | |
| 170 | lo = _mm_xor_si128(lo, b); |
| 171 | hi = _mm_xor_si128(hi, a); |
| 172 | |
| 173 | // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf |
| 174 | __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 175 | lo = _mm_shuffle_epi32(lo, 78); |
| 176 | lo = _mm_xor_si128(lo, t); |
| 177 | t = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 178 | lo = _mm_shuffle_epi32(lo, 78); |
| 179 | lo = _mm_xor_si128(lo, t); |
| 180 | |
| 181 | return _mm_xor_si128(hi, lo); |
| 182 | } |
| 183 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 184 | static inline __m128i gfmul_do_reduce(__m128i hi, __m128i lo, __m128i mid) |
| 185 | { |
| 186 | mid = _mm_xor_si128(mid, hi); |
| 187 | mid = _mm_xor_si128(mid, lo); |
| 188 | lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); |
| 189 | hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); |
| 190 | |
| 191 | /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ |
| 192 | __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 193 | lo = _mm_shuffle_epi32(lo, 78); |
| 194 | lo = _mm_xor_si128(lo, r); |
| 195 | r = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 196 | lo = _mm_shuffle_epi32(lo, 78); |
| 197 | lo = _mm_xor_si128(lo, r); |
| 198 | lo = _mm_xor_si128(hi, lo); |
| 199 | |
| 200 | return lo; |
| 201 | } |
| 202 | |
/**
 * Accumulators of an in-progress GHASH computation (128-bit flavor): the unreduced Karatsuba terms. After gfmul_reduce128,
 * `lo` holds the reduced hash.
 */
struct ptls_fusion_gfmul_state128 {
    __m128i hi;
    __m128i lo;
    __m128i mid;
};
| 206 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 207 | static inline void gfmul_do_step128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 208 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 209 | { |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 210 | __m128i t = _mm_clmulepi64_si128(precompute->H, X, 0x00); |
| 211 | gstate->lo = _mm_xor_si128(gstate->lo, t); |
| 212 | t = _mm_clmulepi64_si128(precompute->H, X, 0x11); |
| 213 | gstate->hi = _mm_xor_si128(gstate->hi, t); |
| 214 | t = _mm_shuffle_epi32(X, 78); |
| 215 | t = _mm_xor_si128(t, X); |
| 216 | t = _mm_clmulepi64_si128(precompute->r, t, 0x00); |
| 217 | gstate->mid = _mm_xor_si128(gstate->mid, t); |
| 218 | } |
| 219 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 220 | static inline void gfmul_firststep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 221 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 222 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 223 | X = _mm_shuffle_epi8(X, byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 224 | X = _mm_xor_si128(gstate->lo, X); |
| 225 | gstate->lo = _mm_setzero_si128(); |
| 226 | gstate->hi = _mm_setzero_si128(); |
| 227 | gstate->mid = _mm_setzero_si128(); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 228 | gfmul_do_step128(gstate, X, precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 229 | } |
| 230 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 231 | static inline void gfmul_nextstep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 232 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 233 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 234 | X = _mm_shuffle_epi8(X, byteswap128); |
| 235 | gfmul_do_step128(gstate, X, precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 236 | } |
| 237 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 238 | static inline void gfmul_reduce128(struct ptls_fusion_gfmul_state128 *gstate) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 239 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 240 | gstate->lo = gfmul_do_reduce(gstate->hi, gstate->lo, gstate->mid); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 241 | } |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 242 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 243 | static inline __m128i gfmul_get_tag128(struct ptls_fusion_gfmul_state128 *gstate, __m128i ek0) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 244 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 245 | __m128i tag = _mm_shuffle_epi8(gstate->lo, byteswap128); |
| 246 | tag = _mm_xor_si128(tag, ek0); |
| 247 | return tag; |
| 248 | } |
| 249 | |
/**
 * Accumulators of an in-progress GHASH computation (256-bit flavor): each member carries two independent 128-bit lanes of
 * unreduced Karatsuba terms. After gfmul_reduce256, the low 128 bits of `lo` hold the reduced hash.
 */
struct ptls_fusion_gfmul_state256 {
    __m256i hi;
    __m256i lo;
    __m256i mid;
};
| 253 | |
| 254 | static inline void gfmul_do_step256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, |
| 255 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 256 | { |
| 257 | __m256i t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x00); |
| 258 | gstate->lo = _mm256_xor_si256(gstate->lo, t); |
| 259 | t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x11); |
| 260 | gstate->hi = _mm256_xor_si256(gstate->hi, t); |
| 261 | t = _mm256_shuffle_epi32(X, 78); |
| 262 | t = _mm256_xor_si256(t, X); |
| 263 | t = _mm256_clmulepi64_epi128(precompute->rx2, t, 0x00); |
| 264 | gstate->mid = _mm256_xor_si256(gstate->mid, t); |
| 265 | } |
| 266 | |
| 267 | static inline void gfmul_firststep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, int half, |
| 268 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 269 | { |
| 270 | X = _mm256_shuffle_epi8(X, byteswap256); |
| 271 | X = _mm256_xor_si256(gstate->lo, X); |
| 272 | if (half) |
| 273 | X = _mm256_permute2f128_si256(X, X, 0x08); |
| 274 | gstate->lo = _mm256_setzero_si256(); |
| 275 | gstate->hi = _mm256_setzero_si256(); |
| 276 | gstate->mid = _mm256_setzero_si256(); |
| 277 | gfmul_do_step256(gstate, X, precompute); |
| 278 | } |
| 279 | |
| 280 | static inline void gfmul_nextstep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, |
| 281 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 282 | { |
| 283 | X = _mm256_shuffle_epi8(X, byteswap256); |
| 284 | gfmul_do_step256(gstate, X, precompute); |
| 285 | } |
| 286 | |
| 287 | static inline void gfmul_reduce256(struct ptls_fusion_gfmul_state256 *gstate) |
| 288 | { |
| 289 | #define XOR_256TO128(y) _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extractf128_si256((y), 1)) |
| 290 | __m128i hi = XOR_256TO128(gstate->hi); |
| 291 | __m128i lo = XOR_256TO128(gstate->lo); |
| 292 | __m128i mid = XOR_256TO128(gstate->mid); |
| 293 | #undef XOR_256TO128 |
| 294 | |
| 295 | lo = gfmul_do_reduce(hi, lo, mid); |
| 296 | gstate->lo = _mm256_castsi128_si256(lo); |
| 297 | } |
| 298 | |
| 299 | static inline __m128i gfmul_get_tag256(struct ptls_fusion_gfmul_state256 *gstate, __m128i ek0) |
| 300 | { |
| 301 | __m128i tag = _mm_shuffle_epi8(_mm256_castsi256_si128(gstate->lo), byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 302 | tag = _mm_xor_si128(tag, ek0); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 303 | return tag; |
| 304 | } |
| 305 | |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 306 | static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v) |
| 307 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 308 | #define ROUNDKEY(i) (ctx->avx256 ? _mm256_castsi256_si128(ctx->keys.m256[i]) : ctx->keys.m128[i]) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 309 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 310 | v = _mm_xor_si128(v, ROUNDKEY(0)); |
| 311 | for (size_t i = 1; i < ctx->rounds; ++i) |
| 312 | v = _mm_aesenc_si128(v, ROUNDKEY(i)); |
| 313 | v = _mm_aesenclast_si128(v, ROUNDKEY(ctx->rounds)); |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 314 | |
| 315 | return v; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 316 | |
| 317 | #undef ROUNDKEY |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 318 | } |
| 319 | |
/* 32 bytes of 0xff followed by 31 bytes of 0x00; a vector loaded from `loadn_mask + 32 - l` has its first `l` bytes set to 0xff
 * and the remaining bytes set to zero */
static const uint8_t loadn_mask[63] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* bytes 0-7 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* bytes 8-15 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* bytes 16-23 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  /* bytes 24-31; the rest is implicitly zero */
};
/* shuffle patterns for _mm_shuffle_epi8; the pattern loaded from `loadn_shuffle + shift` moves byte `shift` to position 0 and
 * fills the positions running past byte 15 with zero */
static const uint8_t loadn_shuffle[31] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* first 16 bytes map to byte
                                                                                                     * offsets */
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80        /* latter 15 bytes map to
                                                                                                     * zero */
};
| 328 | |
Kazuho Oku | 8b9cd57 | 2022-05-04 13:44:33 +0900 | [diff] [blame] | 329 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 330 | static inline __m128i loadn_end_of_page(const void *p, size_t l) |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 331 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 332 | uintptr_t shift = (uintptr_t)p & 15; |
| 333 | __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift)); |
| 334 | return _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern); |
| 335 | } |
| 336 | |
Kazuho Oku | 196e477 | 2022-05-10 09:52:07 +0900 | [diff] [blame] | 337 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 338 | static inline __m128i loadn128(const void *p, size_t l) |
| 339 | { |
| 340 | __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 32 - l)); |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 341 | uintptr_t mod4k = (uintptr_t)p % 4096; |
Goro Fuji | 9a99cf1 | 2021-11-30 12:07:24 +0000 | [diff] [blame] | 342 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 343 | if (PTLS_LIKELY(mod4k <= 4096 - 16) || mod4k + l > 4096) { |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 344 | v = _mm_loadu_si128(p); |
Kazuho Oku | 079b1d0 | 2020-05-14 02:24:28 +0900 | [diff] [blame] | 345 | } else { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 346 | v = loadn_end_of_page(p, l); |
Kazuho Oku | 079b1d0 | 2020-05-14 02:24:28 +0900 | [diff] [blame] | 347 | } |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 348 | v = _mm_and_si128(v, mask); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 349 | |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 350 | return v; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 351 | } |
| 352 | |
Kazuho Oku | 196e477 | 2022-05-10 09:52:07 +0900 | [diff] [blame] | 353 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 354 | static inline __m256i loadn256(const void *p, size_t l) |
| 355 | { |
| 356 | __m256i v, mask = _mm256_loadu_si256((__m256i *)(loadn_mask + 32 - l)); |
| 357 | uintptr_t mod4k = (uintptr_t)p % 4096; |
| 358 | |
| 359 | if (PTLS_LIKELY(mod4k < 4096 - 32) || mod4k + l > 4096) { |
| 360 | v = _mm256_loadu_si256(p); |
| 361 | } else if (l > 16) { |
Kazuho Oku | 7f165e0 | 2022-05-10 07:04:06 +0900 | [diff] [blame] | 362 | __m128i first16 = _mm_loadu_si128(p), second16 = loadn128((uint8_t *)p + 16, l - 16); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 363 | v = _mm256_permute2f128_si256(_mm256_castsi128_si256(first16), _mm256_castsi128_si256(second16), 0x20); |
| 364 | } else if (l == 16) { |
| 365 | v = _mm256_castsi128_si256(_mm_loadu_si128(p)); |
| 366 | } else { |
| 367 | v = _mm256_castsi128_si256(loadn_end_of_page(p, l)); |
| 368 | } |
| 369 | v = _mm256_and_si256(v, mask); |
| 370 | |
| 371 | return v; |
| 372 | } |
| 373 | |
/**
 * Stores the first `l` bytes of `v` to `_p`. Exactly `l` bytes are written; `l` must not exceed 16.
 */
static inline void storen128(void *_p, size_t l, __m128i v)
{
    uint8_t buf[16], *p = _p;

    /* `buf` is a byte array and carries no 16-byte alignment guarantee, hence the unaligned store */
    _mm_storeu_si128((__m128i *)buf, v);

    for (size_t i = 0; i != l; ++i)
        p[i] = buf[i];
}
| 383 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 384 | void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr, |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 385 | const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 386 | { |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 387 | /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ |
| 388 | #define AESECB6_INIT() \ |
| 389 | do { \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 390 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 391 | bits0 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 392 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 393 | bits1 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 394 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 395 | bits2 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 396 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 397 | bits3 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 398 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 399 | bits4 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 400 | if (PTLS_LIKELY(srclen > 16 * 5)) { \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 401 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 402 | bits5 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 403 | } else { \ |
| 404 | if ((state & STATE_EK0_BEEN_FED) == 0) { \ |
| 405 | bits5 = ek0; \ |
| 406 | state |= STATE_EK0_BEEN_FED; \ |
| 407 | } \ |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 408 | if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \ |
| 409 | bits4 = _mm_loadu_si128(supp->input); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 410 | bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 411 | state |= STATE_SUPP_IN_PROCESS; \ |
| 412 | } \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 413 | } \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 414 | __m128i k = ctx->super.ecb.keys.m128[0]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 415 | bits0 = _mm_xor_si128(bits0, k); \ |
| 416 | bits1 = _mm_xor_si128(bits1, k); \ |
| 417 | bits2 = _mm_xor_si128(bits2, k); \ |
| 418 | bits3 = _mm_xor_si128(bits3, k); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 419 | bits4 = _mm_xor_si128(bits4, bits4keys[0]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 420 | bits5 = _mm_xor_si128(bits5, k); \ |
| 421 | } while (0) |
| 422 | |
| 423 | /* aes block update */ |
| 424 | #define AESECB6_UPDATE(i) \ |
| 425 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 426 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 427 | bits0 = _mm_aesenc_si128(bits0, k); \ |
| 428 | bits1 = _mm_aesenc_si128(bits1, k); \ |
| 429 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 430 | bits3 = _mm_aesenc_si128(bits3, k); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 431 | bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 432 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 433 | } while (0) |
| 434 | |
| 435 | /* aesenclast */ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 436 | #define AESECB6_FINAL(i) \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 437 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 438 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 439 | bits0 = _mm_aesenclast_si128(bits0, k); \ |
| 440 | bits1 = _mm_aesenclast_si128(bits1, k); \ |
| 441 | bits2 = _mm_aesenclast_si128(bits2, k); \ |
| 442 | bits3 = _mm_aesenclast_si128(bits3, k); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 443 | bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 444 | bits5 = _mm_aesenclast_si128(bits5, k); \ |
| 445 | } while (0) |
| 446 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 447 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 448 | __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 449 | const __m128i *bits4keys = ctx->super.ecb.keys.m128; /* is changed to supp->ctx.keys when calcurating suppout */ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 450 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 451 | __m128i gdatabuf[6]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 452 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 453 | |
| 454 | // src and dst are updated after the chunk is processed |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 455 | const __m128i *src = input; |
| 456 | __m128i *dst = output; |
| 457 | size_t srclen = inlen; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 458 | // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed indo the processor) |
| 459 | const __m128i *aad = _aad, *dst_ghash = dst; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 460 | size_t dst_ghashlen = srclen; |
| 461 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 462 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 463 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 464 | #define STATE_EK0_BEEN_FED 0x3 |
| 465 | #define STATE_EK0_INCOMPLETE 0x2 |
| 466 | #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 467 | #define STATE_SUPP_USED 0x4 |
| 468 | #define STATE_SUPP_IN_PROCESS 0x8 |
| 469 | int32_t state = supp != NULL ? STATE_SUPP_USED : 0; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 470 | |
| 471 | /* build counter */ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 472 | ctr = _mm_insert_epi32(ctr, 1, 0); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 473 | ek0 = _mm_shuffle_epi8(ctr, byteswap128); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 474 | |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 475 | /* start preparing AES */ |
| 476 | AESECB6_INIT(); |
| 477 | AESECB6_UPDATE(1); |
| 478 | |
| 479 | /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */ |
| 480 | const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH |
| 481 | size_t gdata_cnt = 0; |
| 482 | if (PTLS_LIKELY(aadlen != 0)) { |
| 483 | while (gdata_cnt < 6) { |
| 484 | if (PTLS_LIKELY(aadlen < 16)) { |
| 485 | if (aadlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 486 | gdatabuf[gdata_cnt++] = loadn128(aad, aadlen); |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 487 | aadlen = 0; |
| 488 | } |
| 489 | goto MainLoop; |
| 490 | } |
| 491 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); |
| 492 | aadlen -= 16; |
| 493 | } |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 494 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 495 | |
| 496 | /* the main loop */ |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 497 | MainLoop: |
Kazuho Oku | 303153d | 2020-05-08 16:42:16 +0900 | [diff] [blame] | 498 | while (1) { |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 499 | /* run AES and multiplication in parallel */ |
| 500 | size_t i; |
| 501 | for (i = 2; i < gdata_cnt + 2; ++i) { |
| 502 | AESECB6_UPDATE(i); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 503 | gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 504 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 505 | for (; i < ctx->super.ecb.rounds; ++i) |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 506 | AESECB6_UPDATE(i); |
| 507 | AESECB6_FINAL(i); |
| 508 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 509 | /* apply the bit stream to src and write to dest */ |
| 510 | if (PTLS_LIKELY(srclen >= 6 * 16)) { |
| 511 | #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i)) |
| 512 | APPLY(0); |
| 513 | APPLY(1); |
| 514 | APPLY(2); |
| 515 | APPLY(3); |
| 516 | APPLY(4); |
| 517 | APPLY(5); |
Kazuho Oku | 083f531 | 2020-05-07 13:05:10 +0900 | [diff] [blame] | 518 | #undef APPLY |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 519 | dst += 6; |
| 520 | src += 6; |
| 521 | srclen -= 6 * 16; |
| 522 | } else { |
| 523 | if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) { |
| 524 | ek0 = bits5; |
| 525 | state &= ~STATE_EK0_INCOMPLETE; |
| 526 | } |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 527 | if ((state & STATE_SUPP_IN_PROCESS) != 0) { |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 528 | _mm_storeu_si128((__m128i *)supp->output, bits4); |
| 529 | state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS); |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 530 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 531 | if (srclen != 0) { |
| 532 | #define APPLY(i) \ |
| 533 | do { \ |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 534 | if (PTLS_LIKELY(srclen >= 16)) { \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 535 | _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \ |
| 536 | srclen -= 16; \ |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 537 | } else if (PTLS_LIKELY(srclen != 0)) { \ |
| 538 | bits0 = bits##i; \ |
| 539 | goto ApplyRemainder; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 540 | } else { \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 541 | goto ApplyEnd; \ |
| 542 | } \ |
| 543 | } while (0) |
| 544 | APPLY(0); |
| 545 | APPLY(1); |
| 546 | APPLY(2); |
| 547 | APPLY(3); |
| 548 | APPLY(4); |
| 549 | APPLY(5); |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 550 | #undef APPLY |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 551 | goto ApplyEnd; |
| 552 | ApplyRemainder: |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 553 | storen128(dst, srclen, _mm_xor_si128(loadn128(src, srclen), bits0)); |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 554 | dst = (__m128i *)((uint8_t *)dst + srclen); |
| 555 | srclen = 0; |
| 556 | ApplyEnd:; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 557 | } |
| 558 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 559 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 560 | /* next block AES starts here */ |
| 561 | AESECB6_INIT(); |
| 562 | |
| 563 | AESECB6_UPDATE(1); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 564 | |
| 565 | /* setup gdata */ |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 566 | if (PTLS_UNLIKELY(aadlen != 0)) { |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 567 | gdata_cnt = 0; |
| 568 | while (gdata_cnt < 6) { |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 569 | if (aadlen < 16) { |
| 570 | if (aadlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 571 | gdatabuf[gdata_cnt++] = loadn128(aad, aadlen); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 572 | aadlen = 0; |
| 573 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 574 | goto GdataFillDST; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 575 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 576 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 577 | aadlen -= 16; |
| 578 | } |
| 579 | gdata = gdatabuf; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 580 | } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) { |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 581 | gdata = dst_ghash; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 582 | gdata_cnt = 6; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 583 | dst_ghash += 6; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 584 | dst_ghashlen -= 96; |
| 585 | } else { |
| 586 | gdata_cnt = 0; |
| 587 | GdataFillDST: |
| 588 | while (gdata_cnt < 6) { |
| 589 | if (dst_ghashlen < 16) { |
| 590 | if (dst_ghashlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 591 | gdatabuf[gdata_cnt++] = loadn128(dst_ghash, dst_ghashlen); |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 592 | dst_ghashlen = 0; |
| 593 | } |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 594 | if (gdata_cnt < 6) |
| 595 | goto Finish; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 596 | break; |
| 597 | } |
| 598 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++); |
| 599 | dst_ghashlen -= 16; |
| 600 | } |
| 601 | gdata = gdatabuf; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 602 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 603 | } |
| 604 | |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 605 | Finish: |
| 606 | gdatabuf[gdata_cnt++] = ac; |
| 607 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 608 | /* We have complete set of data to be fed into GHASH. Let's finish the remaining calculation. |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 609 | * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM |
| 610 | * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead. |
| 611 | */ |
| 612 | assert(STATE_EK0_READY()); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 613 | for (size_t i = 0; i < gdata_cnt; ++i) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 614 | gfmul_nextstep128(&gstate, gdatabuf[i], --ghash_precompute); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 615 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 616 | gfmul_reduce128(&gstate); |
| 617 | _mm_storeu_si128(dst, gfmul_get_tag128(&gstate, ek0)); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 618 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 619 | /* Finish the calculation of supplemental vector. Done at the very last, because the sample might cover the GCM tag. */ |
| 620 | if ((state & STATE_SUPP_USED) != 0) { |
| 621 | size_t i; |
| 622 | if ((state & STATE_SUPP_IN_PROCESS) == 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 623 | bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 624 | bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]); |
| 625 | i = 1; |
| 626 | } else { |
| 627 | i = 2; |
| 628 | } |
| 629 | do { |
| 630 | bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 631 | } while (i != ctx->super.ecb.rounds); |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 632 | bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 633 | _mm_storeu_si128((__m128i *)supp->output, bits4); |
| 634 | } |
| 635 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 636 | #undef AESECB6_INIT |
| 637 | #undef AESECB6_UPDATE |
| 638 | #undef AESECB6_FINAL |
| 639 | #undef STATE_EK0_BEEN_FOUND |
| 640 | #undef STATE_EK0_READY |
| 641 | #undef STATE_SUPP_IN_PROCESS |
| 642 | } |
| 643 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 644 | int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr, |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 645 | const void *_aad, size_t aadlen, const void *tag) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 646 | { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 647 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 648 | __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(), |
| 649 | bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128(); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 650 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 651 | __m128i gdatabuf[6]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 652 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 653 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 654 | |
| 655 | const __m128i *gdata; // points to the elements fed into GHASH |
| 656 | size_t gdata_cnt; |
| 657 | |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 658 | const __m128i *src_ghash = input, *src_aes = input, *aad = _aad; |
| 659 | __m128i *dst = output; |
| 660 | size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 661 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 662 | /* schedule ek0 and suppkey */ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 663 | ctr = _mm_add_epi64(ctr, one8); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 664 | bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), ctx->super.ecb.keys.m128[0]); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 665 | ++nondata_aes_cnt; |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 666 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 667 | #define STATE_IS_FIRST_RUN 0x1 |
| 668 | #define STATE_GHASH_HAS_MORE 0x2 |
| 669 | int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 670 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 671 | /* the main loop */ |
| 672 | while (1) { |
| 673 | |
| 674 | /* setup gdata */ |
| 675 | if (PTLS_UNLIKELY(aadlen != 0)) { |
| 676 | gdata = gdatabuf; |
| 677 | gdata_cnt = 0; |
| 678 | while (gdata_cnt < 6) { |
| 679 | if (aadlen < 16) { |
| 680 | if (aadlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 681 | gdatabuf[gdata_cnt++] = loadn128(aad, aadlen); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 682 | aadlen = 0; |
| 683 | ++nondata_aes_cnt; |
| 684 | } |
| 685 | goto GdataFillSrc; |
| 686 | } |
| 687 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); |
| 688 | aadlen -= 16; |
| 689 | ++nondata_aes_cnt; |
| 690 | } |
| 691 | } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) { |
| 692 | gdata = src_ghash; |
| 693 | gdata_cnt = 6; |
| 694 | src_ghash += 6; |
| 695 | src_ghashlen -= 6 * 16; |
| 696 | } else { |
| 697 | gdata = gdatabuf; |
| 698 | gdata_cnt = 0; |
| 699 | GdataFillSrc: |
| 700 | while (gdata_cnt < 6) { |
| 701 | if (src_ghashlen < 16) { |
| 702 | if (src_ghashlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 703 | gdatabuf[gdata_cnt++] = loadn128(src_ghash, src_ghashlen); |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 704 | src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 705 | src_ghashlen = 0; |
| 706 | } |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 707 | if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) { |
| 708 | gdatabuf[gdata_cnt++] = ac; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 709 | state &= ~STATE_GHASH_HAS_MORE; |
| 710 | } |
| 711 | break; |
| 712 | } |
| 713 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++); |
| 714 | src_ghashlen -= 16; |
| 715 | } |
| 716 | } |
| 717 | |
| 718 | /* setup aes bits */ |
| 719 | if (PTLS_LIKELY(nondata_aes_cnt == 0)) |
| 720 | goto InitAllBits; |
| 721 | switch (nondata_aes_cnt) { |
Kazuho Oku | 66a95e5 | 2020-05-12 12:57:37 +0900 | [diff] [blame] | 722 | #define INIT_BITS(n, keys) \ |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 723 | case n: \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 724 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 725 | bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), keys.m128[0]); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 726 | InitAllBits: |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 727 | INIT_BITS(0, ctx->super.ecb.keys); |
| 728 | INIT_BITS(1, ctx->super.ecb.keys); |
| 729 | INIT_BITS(2, ctx->super.ecb.keys); |
| 730 | INIT_BITS(3, ctx->super.ecb.keys); |
| 731 | INIT_BITS(4, ctx->super.ecb.keys); |
| 732 | INIT_BITS(5, ctx->super.ecb.keys); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 733 | #undef INIT_BITS |
| 734 | } |
| 735 | |
| 736 | { /* run aes and ghash */ |
| 737 | #define AESECB6_UPDATE(i) \ |
| 738 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 739 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 740 | bits0 = _mm_aesenc_si128(bits0, k); \ |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 741 | bits1 = _mm_aesenc_si128(bits1, k); \ |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 742 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 743 | bits3 = _mm_aesenc_si128(bits3, k); \ |
| 744 | bits4 = _mm_aesenc_si128(bits4, k); \ |
| 745 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 746 | } while (0) |
| 747 | |
| 748 | size_t aesi; |
| 749 | for (aesi = 1; aesi <= gdata_cnt; ++aesi) { |
| 750 | AESECB6_UPDATE(aesi); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 751 | gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 752 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 753 | for (; aesi < ctx->super.ecb.rounds; ++aesi) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 754 | AESECB6_UPDATE(aesi); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 755 | __m128i k = ctx->super.ecb.keys.m128[aesi]; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 756 | bits0 = _mm_aesenclast_si128(bits0, k); |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 757 | bits1 = _mm_aesenclast_si128(bits1, k); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 758 | bits2 = _mm_aesenclast_si128(bits2, k); |
| 759 | bits3 = _mm_aesenclast_si128(bits3, k); |
| 760 | bits4 = _mm_aesenclast_si128(bits4, k); |
| 761 | bits5 = _mm_aesenclast_si128(bits5, k); |
| 762 | |
| 763 | #undef AESECB6_UPDATE |
| 764 | } |
| 765 | |
| 766 | /* apply aes bits */ |
| 767 | if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) { |
| 768 | #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i)) |
| 769 | APPLY(0); |
| 770 | APPLY(1); |
| 771 | APPLY(2); |
| 772 | APPLY(3); |
| 773 | APPLY(4); |
| 774 | APPLY(5); |
| 775 | #undef APPLY |
| 776 | dst += 6; |
| 777 | src_aes += 6; |
| 778 | src_aeslen -= 6 * 16; |
| 779 | } else { |
| 780 | if ((state & STATE_IS_FIRST_RUN) != 0) { |
| 781 | ek0 = bits0; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 782 | state &= ~STATE_IS_FIRST_RUN; |
| 783 | } |
| 784 | switch (nondata_aes_cnt) { |
| 785 | #define APPLY(i) \ |
| 786 | case i: \ |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 787 | if (PTLS_LIKELY(src_aeslen > 16)) { \ |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 788 | _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \ |
| 789 | src_aeslen -= 16; \ |
| 790 | } else { \ |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 791 | bits0 = bits##i; \ |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 792 | goto Finish; \ |
| 793 | } |
| 794 | APPLY(0); |
| 795 | APPLY(1); |
| 796 | APPLY(2); |
| 797 | APPLY(3); |
| 798 | APPLY(4); |
| 799 | APPLY(5); |
| 800 | #undef APPLY |
| 801 | } |
| 802 | nondata_aes_cnt = 0; |
| 803 | } |
| 804 | } |
| 805 | |
| 806 | Finish: |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 807 | if (src_aeslen == 16) { |
| 808 | _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0)); |
| 809 | } else if (src_aeslen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 810 | storen128(dst, src_aeslen, _mm_xor_si128(loadn128(src_aes, src_aeslen), bits0)); |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 811 | } |
| 812 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 813 | assert((state & STATE_IS_FIRST_RUN) == 0); |
| 814 | |
| 815 | /* the only case where AES operation is complete and GHASH is not is when the application of AC is remaining */ |
| 816 | if ((state & STATE_GHASH_HAS_MORE) != 0) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 817 | assert(ghash_precompute - 1 == ctx->ghash); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 818 | gfmul_nextstep128(&gstate, ac, --ghash_precompute); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 819 | } |
| 820 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 821 | gfmul_reduce128(&gstate); |
| 822 | __m128i calctag = gfmul_get_tag128(&gstate, ek0); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 823 | |
| 824 | return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff; |
| 825 | |
| 826 | #undef STATE_IS_FIRST_RUN |
| 827 | #undef STATE_GHASH_HAS_MORE |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 828 | } |
| 829 | |
/**
 * Key-schedule helper: turns the four 32-bit words of `key` into their running XOR (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3) and
 * then XORs `temp` onto the result.
 */
static __m128i expand_key(__m128i key, __m128i temp)
{
    /* three rounds of "XOR with self shifted up by one 32-bit word" */
    for (int step = 0; step < 3; ++step)
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    return _mm_xor_si128(key, temp);
}
| 840 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 841 | void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size, int avx256) |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 842 | { |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 843 | assert(is_enc && "decryption is not supported (yet)"); |
| 844 | |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 845 | size_t i = 0; |
| 846 | |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 847 | switch (key_size) { |
| 848 | case 16: /* AES128 */ |
| 849 | ctx->rounds = 10; |
| 850 | break; |
| 851 | case 32: /* AES256 */ |
| 852 | ctx->rounds = 14; |
| 853 | break; |
| 854 | default: |
| 855 | assert(!"invalid key size; AES128 / AES256 are supported"); |
| 856 | break; |
| 857 | } |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 858 | ctx->avx256 = avx256; |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 859 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 860 | /* load and expand keys using keys.m128 */ |
| 861 | ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key); |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 862 | if (key_size == 32) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 863 | ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key + 1); |
| 864 | while (1) { |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 865 | #define EXPAND(R) \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 866 | { \ |
| 867 | ctx->keys.m128[i] = \ |
| 868 | expand_key(ctx->keys.m128[i - key_size / 16], \ |
| 869 | _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 870 | if (i == ctx->rounds) \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 871 | break; \ |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 872 | ++i; \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 873 | if (key_size > 24) { \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 874 | ctx->keys.m128[i] = \ |
| 875 | expand_key(ctx->keys.m128[i - key_size / 16], \ |
| 876 | _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 877 | ++i; \ |
| 878 | } \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 879 | } |
| 880 | EXPAND(0x1); |
| 881 | EXPAND(0x2); |
| 882 | EXPAND(0x4); |
| 883 | EXPAND(0x8); |
| 884 | EXPAND(0x10); |
| 885 | EXPAND(0x20); |
| 886 | EXPAND(0x40); |
| 887 | EXPAND(0x80); |
| 888 | EXPAND(0x1b); |
| 889 | EXPAND(0x36); |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 890 | #undef EXPAND |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 891 | } |
| 892 | |
| 893 | /* convert to keys.m256 if avx256 is used */ |
| 894 | if (ctx->avx256) { |
| 895 | size_t i = ctx->rounds; |
| 896 | do { |
| 897 | ctx->keys.m256[i] = _mm256_broadcastsi128_si256(ctx->keys.m128[i]); |
| 898 | } while (i-- != 0); |
| 899 | } |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 900 | } |
| 901 | |
| 902 | void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) |
| 903 | { |
| 904 | ptls_clear_memory(ctx, sizeof(*ctx)); |
| 905 | } |
| 906 | |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 907 | void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src) |
| 908 | { |
| 909 | __m128i v = _mm_loadu_si128(src); |
| 910 | v = aesecb_encrypt(ctx, v); |
| 911 | _mm_storeu_si128(dst, v); |
| 912 | } |
| 913 | |
/**
 * Returns the number of ghash entries that is required to handle an AEAD block of given size.
 */
static size_t aesgcm_calc_ghash_cnt(size_t capacity)
{
    /* number of 16-byte blocks, rounded up */
    size_t blocks = (capacity + 15) / 16;

    /* one extra entry to handle the worst split of the size between AAD and payload, and one to hash AC */
    return blocks + 1 + 1;
}
| 922 | |
| 923 | static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx) |
| 924 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 925 | __m128i *H, *r, *Hprev, H0; |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 926 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 927 | if (ctx->ecb.avx256) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 928 | struct ptls_fusion_aesgcm_context256 *ctx256 = (void *)ctx; |
| 929 | #define GET_SLOT(i, mem) (&ctx256->ghash[(i) / 2].mem[(i) % 2 == 0]) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 930 | H = GET_SLOT(ctx->ghash_cnt, H); |
| 931 | r = GET_SLOT(ctx->ghash_cnt, r); |
| 932 | Hprev = ctx->ghash_cnt == 0 ? NULL : GET_SLOT(ctx->ghash_cnt - 1, H); |
| 933 | #undef GET_SLOT |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 934 | H0 = ctx256->ghash[0].H[1]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 935 | } else { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 936 | struct ptls_fusion_aesgcm_context128 *ctx128 = (void *)ctx; |
| 937 | H = &ctx128->ghash[ctx->ghash_cnt].H; |
| 938 | r = &ctx128->ghash[ctx->ghash_cnt].r; |
| 939 | Hprev = ctx->ghash_cnt == 0 ? NULL : &ctx128->ghash[ctx->ghash_cnt - 1].H; |
| 940 | H0 = ctx128->ghash[0].H; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 941 | } |
| 942 | |
| 943 | if (Hprev != NULL) |
| 944 | *H = gfmul(*Hprev, H0); |
| 945 | |
| 946 | *r = _mm_shuffle_epi32(*H, 78); |
| 947 | *r = _mm_xor_si128(*r, *H); |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 948 | |
| 949 | ++ctx->ghash_cnt; |
| 950 | } |
| 951 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 952 | static size_t calc_aesgcm_context_size(size_t *ghash_cnt, int avx256) |
| 953 | { |
| 954 | size_t sz; |
| 955 | |
| 956 | if (avx256) { |
| 957 | if (*ghash_cnt % 2 != 0) |
| 958 | ++*ghash_cnt; |
| 959 | sz = offsetof(struct ptls_fusion_aesgcm_context256, ghash) + |
| 960 | sizeof(union ptls_fusion_aesgcm_ghash_precompute256) * *ghash_cnt / 2; |
| 961 | } else { |
| 962 | sz = offsetof(struct ptls_fusion_aesgcm_context128, ghash) + |
| 963 | sizeof(struct ptls_fusion_aesgcm_ghash_precompute128) * *ghash_cnt; |
| 964 | } |
| 965 | return sz; |
| 966 | } |
| 967 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 968 | static ptls_fusion_aesgcm_context_t *new_aesgcm(const void *key, size_t key_size, size_t capacity, int avx256) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 969 | { |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 970 | ptls_fusion_aesgcm_context_t *ctx; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 971 | size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity), ctx_size = calc_aesgcm_context_size(&ghash_cnt, avx256); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 972 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 973 | if ((ctx = aligned_alloc(32, ctx_size)) == NULL) |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 974 | return NULL; |
| 975 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 976 | ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size, avx256); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 977 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 978 | ctx->capacity = capacity; |
| 979 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 980 | __m128i H0 = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128()); |
| 981 | H0 = _mm_shuffle_epi8(H0, byteswap128); |
| 982 | H0 = transformH(H0); |
| 983 | if (ctx->ecb.avx256) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 984 | ((struct ptls_fusion_aesgcm_context256 *)ctx)->ghash[0].H[1] = H0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 985 | } else { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 986 | ((struct ptls_fusion_aesgcm_context128 *)ctx)->ghash[0].H = H0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 987 | } |
| 988 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 989 | ctx->ghash_cnt = 0; |
| 990 | while (ctx->ghash_cnt < ghash_cnt) |
| 991 | setup_one_ghash_entry(ctx); |
| 992 | |
| 993 | return ctx; |
| 994 | } |
| 995 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 996 | ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity) |
| 997 | { |
| 998 | return new_aesgcm(key, key_size, capacity, 0); |
| 999 | } |
| 1000 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1001 | ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity) |
| 1002 | { |
| 1003 | size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity); |
| 1004 | |
| 1005 | if (ghash_cnt <= ctx->ghash_cnt) |
| 1006 | return ctx; |
| 1007 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1008 | size_t ctx_size = calc_aesgcm_context_size(&ghash_cnt, ctx->ecb.avx256); |
| 1009 | if ((ctx = realloc(ctx, ctx_size)) == NULL) |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1010 | return NULL; |
| 1011 | |
| 1012 | ctx->capacity = capacity; |
| 1013 | while (ghash_cnt < ctx->ghash_cnt) |
| 1014 | setup_one_ghash_entry(ctx); |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 1015 | |
| 1016 | return ctx; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1017 | } |
| 1018 | |
Kazuho Oku | 31ebd7d | 2020-05-15 06:37:23 +0900 | [diff] [blame] | 1019 | void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1020 | { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1021 | ptls_clear_memory(ctx, calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.avx256)); |
| 1022 | /* skip ptls_fusion_aesecb_dispose, based on the knowledge that it does not allocate memory elsewhere */ |
| 1023 | |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 1024 | free(ctx); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1025 | } |
| 1026 | |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1027 | static void ctr_dispose(ptls_cipher_context_t *_ctx) |
| 1028 | { |
| 1029 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1030 | ptls_fusion_aesecb_dispose(&ctx->fusion); |
| 1031 | _mm_storeu_si128(&ctx->bits, _mm_setzero_si128()); |
| 1032 | } |
| 1033 | |
| 1034 | static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv) |
| 1035 | { |
| 1036 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1037 | _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv))); |
| 1038 | ctx->is_ready = 1; |
| 1039 | } |
| 1040 | |
| 1041 | static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len) |
| 1042 | { |
| 1043 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1044 | |
| 1045 | assert((ctx->is_ready && len <= 16) || |
| 1046 | !"CTR transfomation is supported only once per call to `init` and the maximum size is limited to 16 bytes"); |
| 1047 | ctx->is_ready = 0; |
| 1048 | |
| 1049 | if (len < 16) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1050 | storen128(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn128(input, len))); |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1051 | } else { |
| 1052 | _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input))); |
| 1053 | } |
| 1054 | } |
| 1055 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1056 | static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size) |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1057 | { |
| 1058 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1059 | |
| 1060 | ctx->super.do_dispose = ctr_dispose; |
| 1061 | ctx->super.do_init = ctr_init; |
| 1062 | ctx->super.do_transform = ctr_transform; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1063 | ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size, 0 /* probably we do not need avx256 for CTR? */); |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1064 | ctx->is_ready = 0; |
| 1065 | |
| 1066 | return 0; |
| 1067 | } |
| 1068 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1069 | static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) |
| 1070 | { |
| 1071 | return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); |
| 1072 | } |
| 1073 | |
| 1074 | static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) |
| 1075 | { |
| 1076 | return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); |
| 1077 | } |
| 1078 | |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1079 | static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) |
| 1080 | { |
| 1081 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1082 | |
Kazuho Oku | 31ebd7d | 2020-05-15 06:37:23 +0900 | [diff] [blame] | 1083 | ptls_fusion_aesgcm_free(ctx->aesgcm); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1084 | } |
| 1085 | |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1086 | static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) |
| 1087 | { |
| 1088 | assert(!"FIXME"); |
| 1089 | } |
| 1090 | |
| 1091 | static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen) |
| 1092 | { |
| 1093 | assert(!"FIXME"); |
| 1094 | return SIZE_MAX; |
| 1095 | } |
| 1096 | |
| 1097 | static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) |
| 1098 | { |
| 1099 | assert(!"FIXME"); |
| 1100 | return SIZE_MAX; |
| 1101 | } |
| 1102 | |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1103 | static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1104 | { |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1105 | __m128i ctr = _mm_setzero_si128(); |
| 1106 | ctr = _mm_insert_epi64(ctr, seq, 0); |
Kazuho Oku | 076982f | 2020-05-14 09:28:44 +0900 | [diff] [blame] | 1107 | ctr = _mm_slli_si128(ctr, 4); |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1108 | ctr = _mm_xor_si128(ctx->static_iv, ctr); |
| 1109 | return ctr; |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1110 | } |
| 1111 | |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1112 | void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1113 | const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1114 | { |
| 1115 | struct aesgcm_context *ctx = (void *)_ctx; |
| 1116 | |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1117 | if (inlen + aadlen > ctx->aesgcm->capacity) |
| 1118 | ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen); |
| 1119 | ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp); |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1120 | } |
| 1121 | |
| 1122 | static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1123 | const void *aad, size_t aadlen) |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1124 | { |
| 1125 | assert(!"FIXME"); |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1126 | } |
| 1127 | |
| 1128 | static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1129 | const void *aad, size_t aadlen) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1130 | { |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1131 | struct aesgcm_context *ctx = (void *)_ctx; |
| 1132 | |
| 1133 | if (inlen < 16) |
| 1134 | return SIZE_MAX; |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1135 | |
| 1136 | size_t enclen = inlen - 16; |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1137 | if (enclen + aadlen > ctx->aesgcm->capacity) |
| 1138 | ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen); |
| 1139 | if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen, |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1140 | (const uint8_t *)input + enclen)) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1141 | return SIZE_MAX; |
| 1142 | return enclen; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1143 | } |
| 1144 | |
Christian Huitema | 21c2d3e | 2020-12-06 16:48:12 -0800 | [diff] [blame] | 1145 | static inline void aesgcm_xor_iv(ptls_aead_context_t *_ctx, const void *_bytes, size_t len) |
Christian Huitema | 4f8c485 | 2020-12-05 20:31:30 -0800 | [diff] [blame] | 1146 | { |
| 1147 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1148 | __m128i xor_mask = loadn128(_bytes, len); |
| 1149 | xor_mask = _mm_shuffle_epi8(xor_mask, byteswap128); |
Christian Huitema | 4f8c485 | 2020-12-05 20:31:30 -0800 | [diff] [blame] | 1150 | ctx->static_iv = _mm_xor_si128(ctx->static_iv, xor_mask); |
| 1151 | } |
| 1152 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1153 | static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1154 | { |
| 1155 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1156 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1157 | ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE); |
| 1158 | ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
Kazuho Oku | ae2aeda | 2020-06-14 15:13:18 +0900 | [diff] [blame] | 1159 | if (key == NULL) |
| 1160 | return 0; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1161 | |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1162 | ctx->super.dispose_crypto = aesgcm_dispose_crypto; |
Christian Huitema | 4a4bc22 | 2020-12-06 16:48:02 -0800 | [diff] [blame] | 1163 | ctx->super.do_xor_iv = aesgcm_xor_iv; |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1164 | ctx->super.do_encrypt_init = aead_do_encrypt_init; |
| 1165 | ctx->super.do_encrypt_update = aead_do_encrypt_update; |
| 1166 | ctx->super.do_encrypt_final = aead_do_encrypt_final; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1167 | ctx->super.do_encrypt = aead_do_encrypt; |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1168 | ctx->super.do_encrypt_v = aead_do_encrypt_v; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1169 | ctx->super.do_decrypt = aead_do_decrypt; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1170 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1171 | ctx->aesgcm = new_aesgcm(key, key_size, 1500 /* assume ordinary packet size */, 0 /* no support for avx256 yet */); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1172 | |
| 1173 | return 0; |
| 1174 | } |
| 1175 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1176 | static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
| 1177 | { |
| 1178 | return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE); |
| 1179 | } |
| 1180 | |
| 1181 | static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
| 1182 | { |
| 1183 | return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE); |
| 1184 | } |
| 1185 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1186 | int ptls_fusion_can_avx256 = 0; |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1187 | ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR", |
| 1188 | PTLS_AES128_KEY_SIZE, |
| 1189 | 1, // block size |
| 1190 | PTLS_AES_IV_SIZE, |
| 1191 | sizeof(struct ctr_context), |
| 1192 | aes128ctr_setup}; |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1193 | ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR", |
| 1194 | PTLS_AES256_KEY_SIZE, |
| 1195 | 1, // block size |
| 1196 | PTLS_AES_IV_SIZE, |
| 1197 | sizeof(struct ctr_context), |
| 1198 | aes256ctr_setup}; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1199 | ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", |
Christian Huitema | 11b75d5 | 2020-09-11 23:01:38 -0700 | [diff] [blame] | 1200 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 1201 | PTLS_AESGCM_INTEGRITY_LIMIT, |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1202 | &ptls_fusion_aes128ctr, |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1203 | NULL, // &ptls_fusion_aes128ecb, |
| 1204 | PTLS_AES128_KEY_SIZE, |
| 1205 | PTLS_AESGCM_IV_SIZE, |
| 1206 | PTLS_AESGCM_TAG_SIZE, |
| 1207 | sizeof(struct aesgcm_context), |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1208 | aes128gcm_setup}; |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1209 | ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM", |
Christian Huitema | 11b75d5 | 2020-09-11 23:01:38 -0700 | [diff] [blame] | 1210 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 1211 | PTLS_AESGCM_INTEGRITY_LIMIT, |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1212 | &ptls_fusion_aes256ctr, |
| 1213 | NULL, // &ptls_fusion_aes256ecb, |
| 1214 | PTLS_AES256_KEY_SIZE, |
| 1215 | PTLS_AESGCM_IV_SIZE, |
| 1216 | PTLS_AESGCM_TAG_SIZE, |
| 1217 | sizeof(struct aesgcm_context), |
| 1218 | aes256gcm_setup}; |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 1219 | |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1220 | static inline size_t calc_total_length(ptls_iovec_t *input, size_t incnt) |
| 1221 | { |
| 1222 | size_t totlen = 0; |
| 1223 | for (size_t i = 0; i < incnt; ++i) |
| 1224 | totlen += input[i].len; |
| 1225 | return totlen; |
| 1226 | } |
| 1227 | |
Kazuho Oku | a7006dc | 2022-05-09 17:24:31 +0900 | [diff] [blame] | 1228 | static inline void reduce_aad128(struct ptls_fusion_gfmul_state128 *gstate, struct ptls_fusion_aesgcm_ghash_precompute128 *ghash, |
| 1229 | const void *_aad, size_t aadlen) |
| 1230 | { |
| 1231 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute; |
| 1232 | const uint8_t *aad = _aad; |
| 1233 | |
| 1234 | while (PTLS_UNLIKELY(aadlen >= 6 * 16)) { |
| 1235 | ghash_precompute = ghash + 6; |
| 1236 | gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1237 | aad += 16; |
| 1238 | aadlen -= 16; |
| 1239 | for (int i = 1; i < 6; ++i) { |
| 1240 | gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1241 | aad += 16; |
| 1242 | aadlen -= 16; |
| 1243 | } |
| 1244 | gfmul_reduce128(gstate); |
| 1245 | } |
| 1246 | |
| 1247 | if (PTLS_LIKELY(aadlen != 0)) { |
| 1248 | ghash_precompute = ghash + (aadlen + 15) / 16; |
| 1249 | if (PTLS_UNLIKELY(aadlen >= 16)) { |
| 1250 | gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1251 | aad += 16; |
| 1252 | aadlen -= 16; |
| 1253 | while (aadlen >= 16) { |
| 1254 | gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1255 | aad += 16; |
| 1256 | aadlen -= 16; |
| 1257 | } |
| 1258 | if (PTLS_LIKELY(aadlen != 0)) |
| 1259 | gfmul_nextstep128(gstate, loadn128(aad, aadlen), --ghash_precompute); |
| 1260 | } else { |
| 1261 | gfmul_firststep128(gstate, loadn128(aad, aadlen), --ghash_precompute); |
| 1262 | } |
| 1263 | assert(ghash == ghash_precompute); |
| 1264 | gfmul_reduce128(gstate); |
| 1265 | } |
| 1266 | } |
| 1267 | |
Kazuho Oku | 13ced82 | 2022-05-10 10:09:54 +0900 | [diff] [blame^] | 1268 | NO_SANITIZE_ADDRESS |
| 1269 | static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output) |
| 1270 | { |
| 1271 | uint8_t *encp; |
| 1272 | |
| 1273 | if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) { |
| 1274 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf)))); |
| 1275 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32))); |
| 1276 | *output -= encp - encbuf; |
| 1277 | } |
| 1278 | |
| 1279 | return encp; |
| 1280 | } |
| 1281 | |
| 1282 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1283 | static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end) |
| 1284 | { |
| 1285 | /* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line |
| 1286 | * would likely be read when the next TLS record is being built. */ |
| 1287 | |
| 1288 | for (; end - src >= 64; dst += 64, src += 64) { |
| 1289 | _mm256_stream_si256((void *)dst, _mm256_load_si256((void *)src)); |
| 1290 | _mm256_stream_si256((void *)(dst + 32), _mm256_load_si256((void *)(src + 32))); |
| 1291 | } |
| 1292 | _mm_sfence(); /* weakly ordered writes have to be synced before being passed to NIC */ |
| 1293 | if (src != end) { |
| 1294 | for (; end - src >= 16; dst += 16, src += 16) |
| 1295 | _mm_store_si128((void *)dst, _mm_load_si128((void *)src)); |
| 1296 | if (src != end) |
| 1297 | storen128((void *)dst, end - src, loadn128((void *)src, end - src)); |
| 1298 | } |
| 1299 | } |
| 1300 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1301 | static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt, |
Kazuho Oku | a7006dc | 2022-05-09 17:24:31 +0900 | [diff] [blame] | 1302 | uint64_t seq, const void *aad, size_t aadlen) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1303 | { |
| 1304 | /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ |
| 1305 | #define AESECB6_INIT() \ |
| 1306 | do { \ |
| 1307 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1308 | bits0 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1309 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1310 | bits1 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1311 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1312 | bits2 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1313 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1314 | bits3 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1315 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1316 | bits4 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1317 | if (PTLS_LIKELY(srclen > 16 * 5) || src_vecleft != 0) { \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1318 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1319 | bits5 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1320 | } else { \ |
Kazuho Oku | 4f6bcae | 2022-05-09 07:02:13 +0900 | [diff] [blame] | 1321 | bits5 = ek0; \ |
| 1322 | state |= STATE_EK0_READY; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1323 | } \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1324 | __m128i k = ctx->super.ecb.keys.m128[0]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1325 | bits0 = _mm_xor_si128(bits0, k); \ |
| 1326 | bits1 = _mm_xor_si128(bits1, k); \ |
| 1327 | bits2 = _mm_xor_si128(bits2, k); \ |
| 1328 | bits3 = _mm_xor_si128(bits3, k); \ |
| 1329 | bits4 = _mm_xor_si128(bits4, k); \ |
| 1330 | bits5 = _mm_xor_si128(bits5, k); \ |
| 1331 | } while (0) |
| 1332 | |
| 1333 | /* aes block update */ |
| 1334 | #define AESECB6_UPDATE(i) \ |
| 1335 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1336 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1337 | bits0 = _mm_aesenc_si128(bits0, k); \ |
| 1338 | bits1 = _mm_aesenc_si128(bits1, k); \ |
| 1339 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 1340 | bits3 = _mm_aesenc_si128(bits3, k); \ |
| 1341 | bits4 = _mm_aesenc_si128(bits4, k); \ |
| 1342 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 1343 | } while (0) |
| 1344 | |
| 1345 | /* aesenclast */ |
| 1346 | #define AESECB6_FINAL(i) \ |
| 1347 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1348 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1349 | bits0 = _mm_aesenclast_si128(bits0, k); \ |
| 1350 | bits1 = _mm_aesenclast_si128(bits1, k); \ |
| 1351 | bits2 = _mm_aesenclast_si128(bits2, k); \ |
| 1352 | bits3 = _mm_aesenclast_si128(bits3, k); \ |
| 1353 | bits4 = _mm_aesenclast_si128(bits4, k); \ |
| 1354 | bits5 = _mm_aesenclast_si128(bits5, k); \ |
| 1355 | } while (0) |
| 1356 | |
| 1357 | struct aesgcm_context *agctx = (void *)_ctx; |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1358 | uint8_t *output = _output; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1359 | |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1360 | #define STATE_EK0_READY 0x1 |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1361 | #define STATE_COPY_128B 0x2 |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1362 | int32_t state = 0; |
| 1363 | |
Kazuho Oku | 93b04e1 | 2022-05-09 23:51:18 +0900 | [diff] [blame] | 1364 | /* Bytes are written here first then written using NT store instructions, 64 bytes at a time. |
| 1365 | * Use of thread-local storage is a hack. GCC spills a lot onto stack, and unless we move `encbuf` outside of stack, temporary |
| 1366 | * variables end up being stored onto somewhere not as fast as places near the stack pointer. This provides 18% speed |
| 1367 | * improvement on Core i5 9400, at the cost of 2% slowdown on Zen 3. */ |
| 1368 | #if defined(__GNUC__) && !defined(__clang__) |
| 1369 | static __thread |
| 1370 | #endif |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1371 | uint8_t encbuf[32 * 6] __attribute__((aligned(32))), |
| 1372 | *encp; |
Kazuho Oku | 122a334 | 2022-05-04 14:08:47 +0900 | [diff] [blame] | 1373 | |
| 1374 | /* `encbuf` should be large enough to store up to 63-bytes of unaligned bytes, 6 16-byte AES blocks, plus AEAD tag that is |
| 1375 | * append to the ciphertext before writing the bytes to main memory using NT store instructions. */ |
| 1376 | PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16); |
| 1377 | |
Kazuho Oku | 13ced82 | 2022-05-10 10:09:54 +0900 | [diff] [blame^] | 1378 | /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */ |
| 1379 | encp = load_preceding_unaligned(encbuf, &output); |
| 1380 | |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1381 | /* First write would be 128 bytes (32+6*16), if encbuf contains no less than 32 bytes already. */ |
| 1382 | if (encp - encbuf >= 32) |
| 1383 | state |= STATE_COPY_128B; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1384 | |
| 1385 | /* setup ctr, retain Ek(0), len(A) | len(C) to be fed into GCM */ |
| 1386 | __m128i ctr = calc_counter(agctx, seq); |
| 1387 | ctr = _mm_insert_epi32(ctr, 1, 0); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1388 | __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128); |
| 1389 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1390 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1391 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1392 | __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1393 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1394 | |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1395 | /* find the first non-empty vec */ |
| 1396 | const uint8_t *src = NULL; |
| 1397 | size_t srclen = 0, src_vecleft = incnt; |
| 1398 | while (srclen == 0 && src_vecleft != 0) { |
| 1399 | src = (void *)input[0].base; |
| 1400 | srclen = input[0].len; |
| 1401 | ++input; |
| 1402 | --src_vecleft; |
| 1403 | } |
| 1404 | |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1405 | /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */ |
| 1406 | AESECB6_INIT(); |
| 1407 | AESECB6_UPDATE(1); |
| 1408 | AESECB6_UPDATE(2); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1409 | reduce_aad128(&gstate, ctx->ghash, aad, aadlen); |
| 1410 | for (size_t i = 3; i < ctx->super.ecb.rounds; ++i) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1411 | AESECB6_UPDATE(i); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1412 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1413 | |
| 1414 | /* Main loop. This loop: |
| 1415 | * 1. using current keystream (bits0..bits5), xors a up to 6 * 16 bytes and writes to encbuf, |
| 1416 | * 2. then if there is no more data to be encrypted, exit the loop, otherwise, |
| 1417 | * 3. calculate ghash of the blocks being written to encbuf, |
| 1418 | * 4. calculate next 6 * 16 bytes of keystream, |
| 1419 | * 5. writes encbuf in 64-byte blocks |
| 1420 | * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be |
| 1421 | * calculated. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1422 | size_t remaining_ghash_from = encp - encbuf; |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1423 | if (srclen != 0) { |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1424 | while (1) { |
| 1425 | /* apply the bit stream to input, writing to encbuf */ |
| 1426 | if (PTLS_LIKELY(srclen >= 6 * 16)) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1427 | #define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(src + i * 16)), bits##i)) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1428 | APPLY(0); |
| 1429 | APPLY(1); |
| 1430 | APPLY(2); |
| 1431 | APPLY(3); |
| 1432 | APPLY(4); |
| 1433 | APPLY(5); |
| 1434 | #undef APPLY |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1435 | encp += 6 * 16; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1436 | src += 6 * 16; |
| 1437 | srclen -= 6 * 16; |
| 1438 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1439 | if (src_vecleft == 0) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1440 | remaining_ghash_from = (encp - encbuf) - 96; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1441 | break; |
| 1442 | } |
| 1443 | src = (void *)input[0].base; |
| 1444 | srclen = input[0].len; |
| 1445 | ++input; |
| 1446 | --src_vecleft; |
| 1447 | } |
| 1448 | } else { |
| 1449 | /* slow path, load at most 6 * 16 bytes to encbuf then encrypt in-place */ |
| 1450 | size_t bytes_copied = 0; |
| 1451 | do { |
Kazuho Oku | fa3cd32 | 2022-05-08 17:05:17 +0900 | [diff] [blame] | 1452 | if (srclen >= 16 && bytes_copied < 5 * 16) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1453 | _mm_storeu_si128((void *)(encp + bytes_copied), _mm_loadu_si128((void *)src)); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1454 | bytes_copied += 16; |
| 1455 | src += 16; |
| 1456 | srclen -= 16; |
| 1457 | } else { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1458 | encp[bytes_copied++] = *src++; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1459 | --srclen; |
| 1460 | } |
| 1461 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1462 | if (src_vecleft == 0) { |
| 1463 | break; |
| 1464 | } else { |
| 1465 | src = (void *)input[0].base; |
| 1466 | srclen = input[0].len; |
| 1467 | ++input; |
| 1468 | --src_vecleft; |
| 1469 | } |
| 1470 | } |
| 1471 | } while (bytes_copied < 6 * 16); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1472 | #define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(encp + i * 16)), bits##i)) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1473 | APPLY(0); |
| 1474 | APPLY(1); |
| 1475 | APPLY(2); |
| 1476 | APPLY(3); |
| 1477 | APPLY(4); |
| 1478 | APPLY(5); |
| 1479 | #undef APPLY |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1480 | encp += bytes_copied; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1481 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1482 | /* Calculate amount of data left to be ghashed, as well as zero-clearing the remainder of partial block, as it |
| 1483 | * will be fed into ghash. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1484 | remaining_ghash_from = (encp - encbuf) - bytes_copied; |
Kazuho Oku | 7fb163f | 2022-05-01 14:09:56 +0900 | [diff] [blame] | 1485 | if ((bytes_copied & 15) != 0) |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1486 | _mm_storeu_si128((void *)encp, _mm_setzero_si128()); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1487 | break; |
| 1488 | } |
| 1489 | } |
| 1490 | |
| 1491 | /* Next 96-byte block starts here. Run AES and ghash in while writing output using non-temporal stores in 64-byte |
| 1492 | * blocks. */ |
| 1493 | AESECB6_INIT(); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1494 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + 6; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1495 | gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encp - 6 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1496 | AESECB6_UPDATE(1); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1497 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 5 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1498 | AESECB6_UPDATE(2); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1499 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 4 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1500 | AESECB6_UPDATE(3); |
Kazuho Oku | 7da0917 | 2022-04-30 23:54:59 +0900 | [diff] [blame] | 1501 | _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf)); |
| 1502 | _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32))); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1503 | AESECB6_UPDATE(4); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1504 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 3 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1505 | AESECB6_UPDATE(5); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1506 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 2 * 16)), --ghash_precompute); |
Kazuho Oku | b854db9 | 2022-04-30 23:44:34 +0900 | [diff] [blame] | 1507 | AESECB6_UPDATE(6); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1508 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 1 * 16)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1509 | AESECB6_UPDATE(7); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1510 | if ((state & STATE_COPY_128B) != 0) { |
Kazuho Oku | 7da0917 | 2022-04-30 23:54:59 +0900 | [diff] [blame] | 1511 | _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64))); |
| 1512 | _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96))); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1513 | output += 128; |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1514 | encp -= 128; |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1515 | AESECB6_UPDATE(8); |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1516 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 128))); |
| 1517 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 160))); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1518 | } else { |
| 1519 | output += 64; |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1520 | encp -= 64; |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1521 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 64))); |
| 1522 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 96))); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1523 | AESECB6_UPDATE(8); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1524 | } |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1525 | state ^= STATE_COPY_128B; |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1526 | AESECB6_UPDATE(9); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1527 | if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) { |
| 1528 | for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i) |
Kazuho Oku | 59983e9 | 2022-05-02 12:04:47 +0900 | [diff] [blame] | 1529 | AESECB6_UPDATE(i); |
| 1530 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1531 | assert(ctx->ghash == ghash_precompute); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1532 | gfmul_reduce128(&gstate); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1533 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1534 | } |
| 1535 | } |
| 1536 | |
| 1537 | /* Now, All the encrypted bits are built in encbuf. Calculate AEAD tag and append to encbuf. */ |
| 1538 | |
| 1539 | { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing 7 |
| 1540 | * blocks at once. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1541 | size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16; |
Kazuho Oku | 7fb163f | 2022-05-01 14:09:56 +0900 | [diff] [blame] | 1542 | _mm_storeu_si128((void *)(encbuf + ac_off), ac); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1543 | size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1544 | assert(blocks <= 7); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1545 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + blocks; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1546 | gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1547 | remaining_ghash_from += 16; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1548 | while (ghash_precompute != ctx->ghash) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1549 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1550 | remaining_ghash_from += 16; |
| 1551 | } |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1552 | gfmul_reduce128(&gstate); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1553 | } |
| 1554 | |
| 1555 | /* Calculate EK0, if in the unlikely case on not been done yet. When encoding in full size (16K), EK0 will be ready. */ |
| 1556 | if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1557 | bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]); |
| 1558 | for (size_t i = 1; i < ctx->super.ecb.rounds; ++i) |
| 1559 | bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]); |
| 1560 | bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1561 | } |
| 1562 | |
| 1563 | /* append tag to encbuf */ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1564 | _mm_storeu_si128((void *)encp, gfmul_get_tag128(&gstate, bits5)); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1565 | encp += 16; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1566 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1567 | /* write remaining bytes */ |
| 1568 | write_remaining_bytes(output, encbuf, encp); |
| 1569 | |
| 1570 | #undef AESECB6_INIT |
| 1571 | #undef AESECB6_UPDATE |
| 1572 | #undef AESECB6_FINAL |
| 1573 | #undef STATE_EK0_READY |
| 1574 | #undef STATE_COPY_128B |
| 1575 | } |
| 1576 | |
/**
 * AES-GCM encryption of a scatter-gather input using 256-bit vectors (two AES blocks per register) and non-temporal stores.
 *
 * The ciphertext of all `incnt` vectors of `input`, followed by the 16-byte AEAD tag, is emitted to `_output`. Data is staged in
 * the on-stack `encbuf` and flushed 192 bytes per main-loop iteration with `_mm256_stream_si256`; whatever remains at the end
 * (including the tag) is handed to `write_remaining_bytes`.
 *
 * @param _ctx    AEAD context; reinterpreted as `struct aesgcm_context`
 * @param _output destination of ciphertext + tag
 * @param input   array of input vectors to be encrypted
 * @param incnt   number of elements in `input`
 * @param seq     sequence number fed to `calc_counter` to build the initial counter block
 * @param _aad    additional authenticated data
 * @param aadlen  length of `_aad` in bytes
 *
 * NOTE(review): the helper macros defined inside this function (AESECB6_INIT / AESECB6_UPDATE / AESECB6_FINAL / STATE_EK0_READY)
 * are not #undef'ed at the end, unlike in the 128-bit variant above.
 */
NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v256(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *_aad, size_t aadlen)
{
/* Init the six 256-bit keystream registers (we can always run in full) by advancing the counter and xoring round key 0. If at
 * most 32 * 6 - 16 bytes of input remain in the current vector and there are no more vectors, the upper 128-bit lane of `bits5`
 * is not needed for data; it is replaced with the upper lane of `ac_ek0` so that Ek(0) is computed for free. */
#define AESECB6_INIT()                                                                                                             \
    do {                                                                                                                           \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits0 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits1 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits2 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits3 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits4 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        ctr = _mm256_add_epi64(ctr, incr128x2);                                                                                    \
        bits5 = _mm256_shuffle_epi8(ctr, byteswap256);                                                                             \
        if (PTLS_UNLIKELY(srclen <= 32 * 6 - 16) && src_vecleft == 0) {                                                            \
            bits5 = _mm256_permute2f128_si256(bits5, ac_ek0, 0x30);                                                                \
            state |= STATE_EK0_READY;                                                                                              \
        }                                                                                                                          \
        __m256i k = ctx->super.ecb.keys.m256[0];                                                                                   \
        bits0 = _mm256_xor_si256(bits0, k);                                                                                        \
        bits1 = _mm256_xor_si256(bits1, k);                                                                                        \
        bits2 = _mm256_xor_si256(bits2, k);                                                                                        \
        bits3 = _mm256_xor_si256(bits3, k);                                                                                        \
        bits4 = _mm256_xor_si256(bits4, k);                                                                                        \
        bits5 = _mm256_xor_si256(bits5, k);                                                                                        \
    } while (0)

/* aes block update: applies AES round `i` to all six keystream registers */
#define AESECB6_UPDATE(i)                                                                                                          \
    do {                                                                                                                           \
        __m256i k = ctx->super.ecb.keys.m256[i];                                                                                   \
        bits0 = _mm256_aesenc_epi128(bits0, k);                                                                                    \
        bits1 = _mm256_aesenc_epi128(bits1, k);                                                                                    \
        bits2 = _mm256_aesenc_epi128(bits2, k);                                                                                    \
        bits3 = _mm256_aesenc_epi128(bits3, k);                                                                                    \
        bits4 = _mm256_aesenc_epi128(bits4, k);                                                                                    \
        bits5 = _mm256_aesenc_epi128(bits5, k);                                                                                    \
    } while (0)

/* aesenclast: applies the final AES round (round key `i`) to all six keystream registers */
#define AESECB6_FINAL(i)                                                                                                           \
    do {                                                                                                                           \
        __m256i k = ctx->super.ecb.keys.m256[i];                                                                                   \
        bits0 = _mm256_aesenclast_epi128(bits0, k);                                                                                \
        bits1 = _mm256_aesenclast_epi128(bits1, k);                                                                                \
        bits2 = _mm256_aesenclast_epi128(bits2, k);                                                                                \
        bits3 = _mm256_aesenclast_epi128(bits3, k);                                                                                \
        bits4 = _mm256_aesenclast_epi128(bits4, k);                                                                                \
        bits5 = _mm256_aesenclast_epi128(bits5, k);                                                                                \
    } while (0)

    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *aad = _aad;

/* set in `state` once the upper lane of `bits5` carries the keystream block used for the tag (Ek(0)) */
#define STATE_EK0_READY 0x1
    int32_t state = 0;

    /* Bytes are written here first, then flushed to `output` using 32-byte NT store instructions (192 bytes per main-loop
     * iteration). */
    uint8_t encbuf[32 * 9] __attribute__((aligned(32))), *encp;

    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, six 32-byte chunks (i.e., twelve 16-byte AES
     * blocks) of ciphertext, plus the 16 bytes appended to the ciphertext before the remaining bytes are written to main
     * memory. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16);

    /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);

    /* setup ctr, retaining the counter block for Ek(0) and len(A) | len(C), both to be fed into GCM */
    __m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq));
    ctr = _mm256_insert_epi32(ctr, 1, 4); /* set the lowest 32 bits of the upper lane to 1 */
    __m256i ac_ek0 = _mm256_permute2f128_si256(
        /* first half (lower lane): ac */
        _mm256_castsi128_si256(
            _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128)),
        /* second half (upper lane): byte-swapped counter block from which Ek(0) is derived */
        _mm256_shuffle_epi8(ctr, byteswap256), 0x30);

    struct ptls_fusion_aesgcm_context256 *ctx = (void *)agctx->aesgcm;
    __m256i bits0, bits1, bits2, bits3, bits4, bits5 = _mm256_setzero_si256();
    struct ptls_fusion_gfmul_state256 gstate = {0};

    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }

    /* Prepare the first 6 * 32 bytes of keystream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    if (PTLS_LIKELY(aadlen != 0)) {
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute;
        /* bulk: consume AAD 6 * 32 bytes at a time, one reduction per batch */
        while (PTLS_UNLIKELY(aadlen >= 6 * 32)) {
            ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
            aad += 32;
            aadlen -= 32;
            for (int i = 1; i < 6; ++i) {
                gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                aad += 32;
                aadlen -= 32;
            }
            gfmul_reduce256(&gstate);
        }
        /* remainder (less than 6 * 32 bytes) */
        if (PTLS_LIKELY(aadlen != 0)) {
            ghash_precompute = ctx->ghash + (aadlen + 31) / 32;
            if (PTLS_UNLIKELY(aadlen >= 32)) {
                /* When the trailing partial chunk would be 1..16 bytes long, the first step advances by only 16 bytes (third
                 * argument being 1; presumably telling gfmul to use a single 16-byte block -- TODO confirm), so that the final
                 * partial chunk is always longer than 16 bytes (see the assert below). */
                if (aadlen % 32 == 0 || aadlen % 32 > 16) {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                } else {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 1, --ghash_precompute);
                    aad += 16;
                    aadlen -= 16;
                }
                while (aadlen >= 32) {
                    gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                }
                if (PTLS_LIKELY(aadlen != 0)) {
                    assert(aadlen > 16);
                    gfmul_nextstep256(&gstate, loadn256(aad, aadlen), --ghash_precompute);
                }
            } else {
                /* entire AAD fits in one 32-byte load */
                gfmul_firststep256(&gstate, loadn256(aad, aadlen), aadlen <= 16, --ghash_precompute);
            }
            /* all precomputed entries selected above must have been consumed */
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
        }
    }
    /* run the remaining AES rounds of the first keystream batch */
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);

    /* Main loop. This loop:
     *  1. using current keystream (bits0..bits5), xors up to 6 * 32 bytes and writes them to encbuf,
     *  2. then if there is no more data to be encrypted, exits the loop, otherwise,
     *  3. calculates ghash of the blocks being written to encbuf,
     *  4. calculates next 6 * 32 bytes of keystream,
     *  5. writes 192 bytes of encbuf to output using 32-byte non-temporal stores.
     * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be
     * calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 32)) {
#define APPLY(i) _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(src + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 32;
                src += 6 * 32;
                srclen -= 6 * 32;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        /* end of input; the 6 * 32 bytes just written are yet to be ghashed */
                        remaining_ghash_from = (encp - encbuf) - 6 * 32;
                        break;
                    }
                    /* move on to the next input vector */
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 32 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 32 && bytes_copied < 5 * 32) {
                        _mm256_storeu_si256((void *)(encp + bytes_copied), _mm256_loadu_si256((void *)src));
                        bytes_copied += 32;
                        src += 32;
                        srclen -= 32;
                    } else {
                        /* byte-by-byte copy */
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        if (src_vecleft == 0) {
                            break;
                        } else {
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        }
                    }
                } while (bytes_copied < 6 * 32);
#define APPLY(i)                                                                                                                   \
    _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(encp + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate amount of data left to be ghashed, as well as zero-clearing the remainder of partial block, as it
                     * will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }

            /* Next 192-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal store
             * instructions. */
            AESECB6_INIT();
            union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encp - 6 * 32)), 0, --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 5 * 32)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 4 * 32)), --ghash_precompute);
            AESECB6_UPDATE(3);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 3 * 32)), --ghash_precompute);
            AESECB6_UPDATE(4);
            /* flush the first 192 bytes of encbuf to output */
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
            _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
            _mm256_stream_si256((void *)(output + 128), _mm256_load_si256((void *)(encbuf + 128)));
            _mm256_stream_si256((void *)(output + 160), _mm256_load_si256((void *)(encbuf + 160)));
            AESECB6_UPDATE(5);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 2 * 32)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 1 * 32)), --ghash_precompute);
            output += 192;
            encp -= 192;
            AESECB6_UPDATE(7);
            /* move the up-to-64 bytes that were not flushed to the beginning of encbuf */
            _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 192)));
            AESECB6_UPDATE(8);
            _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 224)));
            AESECB6_UPDATE(9);
            /* more than 10 rounds are needed for key sizes other than 128 bits */
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }

    /* Now, all the encrypted bits are built in encbuf. Calculate AEAD tag and append to encbuf. */

    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * up to 13 16-byte blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), _mm256_castsi256_si128(ac_ek0));
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 13);
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + blocks / 2;
        if (blocks % 2 != 0) {
            /* odd number of 16-byte blocks: the first step advances by a single block */
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 1, ghash_precompute);
            remaining_ghash_from += 16;
        } else {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 0, --ghash_precompute);
            remaining_ghash_from += 32;
        }
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 32;
        }
        gfmul_reduce256(&gstate);
    }

    /* Calculate EK0 in the unlikely case that it has not been done yet. When encoding in full size (16K), EK0 will be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = ac_ek0;
        bits5 = _mm256_xor_si256(bits5, ctx->super.ecb.keys.m256[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm256_aesenc_epi128(bits5, ctx->super.ecb.keys.m256[i]);
        bits5 = _mm256_aesenclast_epi128(bits5, ctx->super.ecb.keys.m256[ctx->super.ecb.rounds]);
    }

    /* append tag to encbuf; EK0 resides in the upper 128-bit lane of `bits5`, which the permute moves to the lower lane */
    _mm_storeu_si128((void *)encp,
                     gfmul_get_tag256(&gstate, _mm256_castsi256_si128(_mm256_permute2f128_si256(bits5, bits5, 0x11))));
    encp += 16;

    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);
}
| 1881 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1882 | static int nt_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1883 | { |
| 1884 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1885 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1886 | ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE); |
| 1887 | ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1888 | if (key == NULL) |
| 1889 | return 0; |
| 1890 | |
| 1891 | ctx->super.dispose_crypto = aesgcm_dispose_crypto; |
| 1892 | ctx->super.do_xor_iv = aesgcm_xor_iv; |
| 1893 | ctx->super.do_encrypt = ptls_aead__do_encrypt; |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1894 | ctx->super.do_encrypt_v = ptls_fusion_can_avx256 ? non_temporal_encrypt_v256 : non_temporal_encrypt_v128; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1895 | ctx->super.do_decrypt = NULL; /* FIXME */ |
| 1896 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1897 | ctx->aesgcm = new_aesgcm(key, key_size, |
| 1898 | 7 * (ptls_fusion_can_avx256 ? 32 : 16), // 6 blocks at once, plus len(A) | len(C) that we might append |
| 1899 | ptls_fusion_can_avx256); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1900 | |
| 1901 | return 0; |
| 1902 | } |
| 1903 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1904 | static int non_temporal_aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1905 | { |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1906 | return nt_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1907 | } |
| 1908 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1909 | static int non_temporal_aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1910 | { |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1911 | return nt_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1912 | } |
| 1913 | |
/**
 * AEAD algorithm descriptor for AES-128-GCM whose setup callback (`non_temporal_aes128gcm_setup`) installs the non-temporal
 * encryptors. The initializer is positional; the slot following the CTR cipher is left NULL.
 */
ptls_aead_algorithm_t ptls_non_temporal_aes128gcm = {"AES128-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes128ctr,
                                                     NULL, /* disabled: &ptls_fusion_aes128ecb */
                                                     PTLS_AES128_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes128gcm_setup};
/**
 * AEAD algorithm descriptor for AES-256-GCM whose setup callback (`non_temporal_aes256gcm_setup`) installs the non-temporal
 * encryptors. The initializer is positional; the slot following the CTR cipher is left NULL.
 */
ptls_aead_algorithm_t ptls_non_temporal_aes256gcm = {"AES256-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes256ctr,
                                                     /* disabled: &ptls_fusion_aes128ecb -- NOTE(review): the disabled value names
                                                      * the AES-128 ECB cipher; presumably the AES-256 one was meant, confirm
                                                      * before enabling */
                                                     NULL,
                                                     PTLS_AES256_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes256gcm_setup};
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1934 | |
Christian Huitema | 3c3e3f2 | 2020-06-23 15:49:15 -0700 | [diff] [blame] | 1935 | #ifdef _WINDOWS |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1936 | /** |
| 1937 | * ptls_fusion_is_supported_by_cpu: |
| 1938 | * Check that the CPU has extended instructions for PCMUL, AES and AVX2. |
| 1939 | * This test assumes that the CPU is following the x86/x64 architecture. |
| 1940 | * A slightly more refined test could check that the cpu_info spells out |
| 1941 | * "genuineIntel" or "authenticAMD", but would fail in presence of |
| 1942 | * little known CPU brands or some VM */ |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 1943 | int ptls_fusion_is_supported_by_cpu(void) |
| 1944 | { |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1945 | uint32_t cpu_info[4]; |
| 1946 | uint32_t nb_ids; |
| 1947 | int is_supported = 0; |
| 1948 | |
| 1949 | __cpuid(cpu_info, 0); |
| 1950 | nb_ids = cpu_info[0]; |
| 1951 | |
| 1952 | if (nb_ids >= 7) { |
| 1953 | uint32_t leaf1_ecx; |
| 1954 | __cpuid(cpu_info, 1); |
| 1955 | leaf1_ecx = cpu_info[2]; |
Kazuho Oku | 14c00c0 | 2020-09-12 20:48:25 +0900 | [diff] [blame] | 1956 | |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1957 | if (/* PCLMUL */ (leaf1_ecx & (1 << 5)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1958 | uint32_t leaf7_ebx, leaf7_ecx; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1959 | __cpuid(cpu_info, 7); |
| 1960 | leaf7_ebx = cpu_info[1]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1961 | leaf7_ecx = cpu_info[2]; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1962 | |
| 1963 | is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1964 | |
| 1965 | /* enable 256-bit mode if possible */ |
Kazuho Oku | 7f165e0 | 2022-05-10 07:04:06 +0900 | [diff] [blame] | 1966 | if (is_supported && (leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_avx256) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1967 | ptls_fusion_can_avx256 = 1; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 1968 | } |
| 1969 | } |
| 1970 | |
| 1971 | return is_supported; |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 1972 | } |
| 1973 | #else |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 1974 | int ptls_fusion_is_supported_by_cpu(void) |
| 1975 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1976 | unsigned leaf1_ecx, leaf7_ebx, leaf7_ecx; |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 1977 | |
| 1978 | { /* GCC-specific code to obtain CPU features */ |
Kazuho Oku | efce043 | 2020-05-15 04:51:58 +0900 | [diff] [blame] | 1979 | unsigned leaf_cnt; |
| 1980 | __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx"); |
| 1981 | if (leaf_cnt < 7) |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 1982 | return 0; |
Kazuho Oku | efce043 | 2020-05-15 04:51:58 +0900 | [diff] [blame] | 1983 | __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx"); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1984 | __asm__("cpuid" : "=b"(leaf7_ebx), "=c"(leaf7_ecx) : "a"(7), "c"(0) : "edx"); |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 1985 | } |
| 1986 | |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 1987 | /* AVX2 */ |
| 1988 | if ((leaf7_ebx & (1 << 5)) == 0) |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 1989 | return 0; |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 1990 | /* AES */ |
| 1991 | if ((leaf1_ecx & (1 << 25)) == 0) |
| 1992 | return 0; |
| 1993 | /* PCLMUL */ |
| 1994 | if ((leaf1_ecx & (1 << 1)) == 0) |
| 1995 | return 0; |
| 1996 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1997 | /* enable 256-bit mode if possible */ |
| 1998 | if ((leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_avx256) |
| 1999 | ptls_fusion_can_avx256 = 1; |
| 2000 | |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 2001 | return 1; |
| 2002 | } |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 2003 | #endif |