Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1 | /* |
| 2 | * This source file is licensed under the Apache License 2.0 *and* the MIT |
| 3 | * License. Please agree to *both* of the licensing terms! |
| 4 | * |
| 5 | * |
| 6 | * `transformH` function is a derivative work of OpenSSL. The original work |
| 7 | * is covered by the following license: |
| 8 | * |
| 9 | * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. |
| 10 | * |
| 11 | * Licensed under the Apache License 2.0 (the "License"). You may not use |
| 12 | * this file except in compliance with the License. You can obtain a copy |
| 13 | * in the file LICENSE in the source distribution or at |
| 14 | * https://www.openssl.org/source/license.html |
| 15 | * |
| 16 | * |
| 17 | * All other work, including modifications to the `transformH` function is |
| 18 | * covered by the following MIT license: |
| 19 | * |
Kazuho Oku | d1a0912 | 2022-05-09 16:06:36 +0900 | [diff] [blame] | 20 | * Copyright (c) 2020-2022 Fastly, Kazuho Oku |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 21 | * |
| 22 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 23 | * of this software and associated documentation files (the "Software"), to |
| 24 | * deal in the Software without restriction, including without limitation the |
| 25 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 26 | * sell copies of the Software, and to permit persons to whom the Software is |
| 27 | * furnished to do so, subject to the following conditions: |
| 28 | * |
| 29 | * The above copyright notice and this permission notice shall be included in |
| 30 | * all copies or substantial portions of the Software. |
| 31 | * |
| 32 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 33 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 34 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 35 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 36 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 37 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| 38 | * IN THE SOFTWARE. |
| 39 | */ |
| 40 | #include <stdint.h> |
Kazuho Oku | 14c00c0 | 2020-09-12 20:48:25 +0900 | [diff] [blame] | 41 | |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 42 | #include <stdlib.h> |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 43 | #include <string.h> |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 44 | #include <immintrin.h> |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 45 | #include <tmmintrin.h> |
Kazuho Oku | 02ca0f0 | 2020-05-13 20:46:39 +0900 | [diff] [blame] | 46 | #include <nmmintrin.h> |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 47 | #include <wmmintrin.h> |
| 48 | #include "picotls.h" |
| 49 | #include "picotls/fusion.h" |
| 50 | |
/*
 * NO_SANITIZE_ADDRESS: function attribute that disables AddressSanitizer instrumentation when the file is built with ASan
 * (clang: detected through __has_feature; gcc: detected through __SANITIZE_ADDRESS__). Expands to nothing otherwise.
 */
#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
#endif
#elif __SANITIZE_ADDRESS__ /* gcc */
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#endif
#if !defined(NO_SANITIZE_ADDRESS)
#define NO_SANITIZE_ADDRESS
#endif
| 61 | |
/*
 * Aligned allocation shim. When _WINDOWS is defined, aligned_alloc is mapped onto _aligned_malloc (which takes size first,
 * alignment second) and memory is released with _aligned_free; otherwise aligned memory is released with plain free().
 */
#if !defined(_WINDOWS)
#define aligned_free(p) free(p)
#else
#define aligned_alloc(a, s) _aligned_malloc((s), (a))
#define aligned_free(p) _aligned_free(p)
#endif
| 68 | |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 69 | struct ptls_fusion_aesgcm_context { |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 70 | ptls_fusion_aesecb_context_t ecb; |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 71 | size_t capacity; |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 72 | size_t ghash_cnt; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 73 | }; |
| 74 | |
| 75 | struct ptls_fusion_aesgcm_context128 { |
| 76 | struct ptls_fusion_aesgcm_context super; |
| 77 | struct ptls_fusion_aesgcm_ghash_precompute128 { |
| 78 | __m128i H; |
| 79 | __m128i r; |
| 80 | } ghash[0]; |
| 81 | }; |
| 82 | |
| 83 | struct ptls_fusion_aesgcm_context256 { |
| 84 | struct ptls_fusion_aesgcm_context super; |
| 85 | union ptls_fusion_aesgcm_ghash_precompute256 { |
| 86 | struct { |
| 87 | __m128i H[2]; |
| 88 | __m128i r[2]; |
| 89 | }; |
| 90 | struct { |
| 91 | __m256i Hx2; |
| 92 | __m256i rx2; |
| 93 | }; |
| 94 | } ghash[0]; |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 95 | }; |
| 96 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 97 | struct ctr_context { |
| 98 | ptls_cipher_context_t super; |
| 99 | ptls_fusion_aesecb_context_t fusion; |
| 100 | __m128i bits; |
| 101 | uint8_t is_ready; |
| 102 | }; |
| 103 | |
| 104 | struct aesgcm_context { |
| 105 | ptls_aead_context_t super; |
| 106 | ptls_fusion_aesgcm_context_t *aesgcm; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 107 | /** |
| 108 | * retains the static IV in the upper 96 bits (in little endian) |
| 109 | */ |
| 110 | __m128i static_iv; |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 111 | }; |
| 112 | |
/*
 * Constant tables and the macros that expose them as SIMD values. The tables are `const`, so the accessor macros cast to
 * pointers-to-const; the resulting expressions are meant to be used as read-only operands.
 */

/* reduction constant used by transformH and the GHASH reduction; low 64 bits = 1, high 64 bits = 0xc200000000000000 */
static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
#define poly (*(const __m128i *)poly_)
/* byte shuffle pattern that reverses the order of the 16 bytes within each 128-bit lane */
static const uint8_t byteswap_[32] __attribute__((aligned(32))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                                                   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#define byteswap128 (*(const __m128i *)byteswap_)
#define byteswap256 (*(const __m256i *)byteswap_)
/* 128-bit value with only the lowest byte set to 1 */
static const uint8_t one_[16] __attribute__((aligned(16))) = {1};
#define one8 (*(const __m128i *)one_)
/* 256-bit value with the lowest byte of each 128-bit lane set to 2 */
static const uint8_t incr128x2_[32] __attribute__((aligned(32))) = {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2};
#define incr128x2 (*(const __m256i *)incr128x2_)
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 123 | |
Kazuho Oku | ae95e4c | 2020-05-11 06:27:27 +0900 | [diff] [blame] | 124 | /* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl |
| 125 | * at commit 33388b4. */ |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 126 | static __m128i transformH(__m128i H) |
| 127 | { |
| 128 | // # <<1 twist |
| 129 | // pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword |
| 130 | __m128i t2 = _mm_shuffle_epi32(H, 0xff); |
| 131 | // movdqa $Hkey,$T1 |
| 132 | __m128i t1 = H; |
| 133 | // psllq \$1,$Hkey |
| 134 | H = _mm_slli_epi64(H, 1); |
| 135 | // pxor $T3,$T3 # |
| 136 | __m128i t3 = _mm_setzero_si128(); |
| 137 | // psrlq \$63,$T1 |
| 138 | t1 = _mm_srli_epi64(t1, 63); |
| 139 | // pcmpgtd $T2,$T3 # broadcast carry bit |
| 140 | t3 = _mm_cmplt_epi32(t2, t3); |
| 141 | // pslldq \$8,$T1 |
| 142 | t1 = _mm_slli_si128(t1, 8); |
| 143 | // por $T1,$Hkey # H<<=1 |
| 144 | H = _mm_or_si128(t1, H); |
| 145 | |
| 146 | // # magic reduction |
| 147 | // pand .L0x1c2_polynomial(%rip),$T3 |
| 148 | t3 = _mm_and_si128(t3, poly); |
| 149 | // pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial |
| 150 | H = _mm_xor_si128(t3, H); |
| 151 | |
| 152 | return H; |
| 153 | } |
| 154 | // end of Apache License code |
| 155 | |
| 156 | static __m128i gfmul(__m128i x, __m128i y) |
| 157 | { |
| 158 | __m128i lo = _mm_clmulepi64_si128(x, y, 0x00); |
| 159 | __m128i hi = _mm_clmulepi64_si128(x, y, 0x11); |
| 160 | |
| 161 | __m128i a = _mm_shuffle_epi32(x, 78); |
| 162 | __m128i b = _mm_shuffle_epi32(y, 78); |
| 163 | a = _mm_xor_si128(a, x); |
| 164 | b = _mm_xor_si128(b, y); |
| 165 | |
| 166 | a = _mm_clmulepi64_si128(a, b, 0x00); |
| 167 | a = _mm_xor_si128(a, lo); |
| 168 | a = _mm_xor_si128(a, hi); |
| 169 | |
| 170 | b = _mm_slli_si128(a, 8); |
| 171 | a = _mm_srli_si128(a, 8); |
| 172 | |
| 173 | lo = _mm_xor_si128(lo, b); |
| 174 | hi = _mm_xor_si128(hi, a); |
| 175 | |
| 176 | // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf |
| 177 | __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 178 | lo = _mm_shuffle_epi32(lo, 78); |
| 179 | lo = _mm_xor_si128(lo, t); |
| 180 | t = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 181 | lo = _mm_shuffle_epi32(lo, 78); |
| 182 | lo = _mm_xor_si128(lo, t); |
| 183 | |
| 184 | return _mm_xor_si128(hi, lo); |
| 185 | } |
| 186 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 187 | static inline __m128i gfmul_do_reduce(__m128i hi, __m128i lo, __m128i mid) |
| 188 | { |
| 189 | mid = _mm_xor_si128(mid, hi); |
| 190 | mid = _mm_xor_si128(mid, lo); |
| 191 | lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8)); |
| 192 | hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8)); |
| 193 | |
| 194 | /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */ |
| 195 | __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 196 | lo = _mm_shuffle_epi32(lo, 78); |
| 197 | lo = _mm_xor_si128(lo, r); |
| 198 | r = _mm_clmulepi64_si128(lo, poly, 0x10); |
| 199 | lo = _mm_shuffle_epi32(lo, 78); |
| 200 | lo = _mm_xor_si128(lo, r); |
| 201 | lo = _mm_xor_si128(hi, lo); |
| 202 | |
| 203 | return lo; |
| 204 | } |
| 205 | |
/**
 * Accumulators of the 128-bit GHASH multiplier. gfmul_do_step128 XORs partial products into the three members and
 * gfmul_reduce128 combines them, leaving the reduced value in `lo`.
 */
struct ptls_fusion_gfmul_state128 {
    __m128i hi;
    __m128i lo;
    __m128i mid;
};
| 209 | |
/*
 * xor128(x, y): 128-bit XOR. With gcc (but not clang) it is emitted as an explicit VPXOR through inline assembly; every other
 * compiler uses the _mm_xor_si128 intrinsic. NOTE(review): the reason for forcing the instruction on gcc is not visible here.
 */
#if !defined(__GNUC__) || defined(__clang__)
#define xor128 _mm_xor_si128
#else
static inline __m128i xor128(__m128i x, __m128i y)
{
    __m128i result;
    __asm__("vpxor %2, %1, %0" : "=x"(result) : "x"(x), "xm"(y));
    return result;
}
#endif
| 220 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 221 | static inline void gfmul_do_step128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 222 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 223 | { |
Kazuho Oku | 9f8e12a | 2022-05-11 08:21:45 +0900 | [diff] [blame] | 224 | __m128i t1 = _mm_clmulepi64_si128(precompute->H, X, 0x00); |
| 225 | __m128i t2 = _mm_clmulepi64_si128(precompute->H, X, 0x11); |
| 226 | __m128i t3 = _mm_shuffle_epi32(X, 78); |
| 227 | t3 = _mm_xor_si128(t3, X); |
| 228 | t3 = _mm_clmulepi64_si128(precompute->r, t3, 0x00); |
| 229 | gstate->lo = xor128(gstate->lo, t1); |
| 230 | gstate->hi = xor128(gstate->hi, t2); |
| 231 | gstate->mid = xor128(gstate->mid, t3); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 232 | } |
| 233 | |
Kazuho Oku | 9f8e12a | 2022-05-11 08:21:45 +0900 | [diff] [blame] | 234 | #undef xor128 |
| 235 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 236 | static inline void gfmul_firststep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 237 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 238 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 239 | X = _mm_shuffle_epi8(X, byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 240 | X = _mm_xor_si128(gstate->lo, X); |
| 241 | gstate->lo = _mm_setzero_si128(); |
| 242 | gstate->hi = _mm_setzero_si128(); |
| 243 | gstate->mid = _mm_setzero_si128(); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 244 | gfmul_do_step128(gstate, X, precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 245 | } |
| 246 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 247 | static inline void gfmul_nextstep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X, |
| 248 | struct ptls_fusion_aesgcm_ghash_precompute128 *precompute) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 249 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 250 | X = _mm_shuffle_epi8(X, byteswap128); |
| 251 | gfmul_do_step128(gstate, X, precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 252 | } |
| 253 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 254 | static inline void gfmul_reduce128(struct ptls_fusion_gfmul_state128 *gstate) |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 255 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 256 | gstate->lo = gfmul_do_reduce(gstate->hi, gstate->lo, gstate->mid); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 257 | } |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 258 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 259 | static inline __m128i gfmul_get_tag128(struct ptls_fusion_gfmul_state128 *gstate, __m128i ek0) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 260 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 261 | __m128i tag = _mm_shuffle_epi8(gstate->lo, byteswap128); |
| 262 | tag = _mm_xor_si128(tag, ek0); |
| 263 | return tag; |
| 264 | } |
| 265 | |
/**
 * Accumulators of the 256-bit GHASH multiplier. gfmul_do_step256 XORs partial products into the three members;
 * gfmul_reduce256 folds the two 128-bit lanes together and leaves the reduced value in the low lane of `lo`.
 */
struct ptls_fusion_gfmul_state256 {
    __m256i hi;
    __m256i lo;
    __m256i mid;
};
| 269 | |
| 270 | static inline void gfmul_do_step256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, |
| 271 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 272 | { |
| 273 | __m256i t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x00); |
| 274 | gstate->lo = _mm256_xor_si256(gstate->lo, t); |
| 275 | t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x11); |
| 276 | gstate->hi = _mm256_xor_si256(gstate->hi, t); |
| 277 | t = _mm256_shuffle_epi32(X, 78); |
| 278 | t = _mm256_xor_si256(t, X); |
| 279 | t = _mm256_clmulepi64_epi128(precompute->rx2, t, 0x00); |
| 280 | gstate->mid = _mm256_xor_si256(gstate->mid, t); |
| 281 | } |
| 282 | |
| 283 | static inline void gfmul_firststep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, int half, |
| 284 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 285 | { |
| 286 | X = _mm256_shuffle_epi8(X, byteswap256); |
| 287 | X = _mm256_xor_si256(gstate->lo, X); |
| 288 | if (half) |
| 289 | X = _mm256_permute2f128_si256(X, X, 0x08); |
| 290 | gstate->lo = _mm256_setzero_si256(); |
| 291 | gstate->hi = _mm256_setzero_si256(); |
| 292 | gstate->mid = _mm256_setzero_si256(); |
| 293 | gfmul_do_step256(gstate, X, precompute); |
| 294 | } |
| 295 | |
| 296 | static inline void gfmul_nextstep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, |
| 297 | union ptls_fusion_aesgcm_ghash_precompute256 *precompute) |
| 298 | { |
| 299 | X = _mm256_shuffle_epi8(X, byteswap256); |
| 300 | gfmul_do_step256(gstate, X, precompute); |
| 301 | } |
| 302 | |
| 303 | static inline void gfmul_reduce256(struct ptls_fusion_gfmul_state256 *gstate) |
| 304 | { |
| 305 | #define XOR_256TO128(y) _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extractf128_si256((y), 1)) |
| 306 | __m128i hi = XOR_256TO128(gstate->hi); |
| 307 | __m128i lo = XOR_256TO128(gstate->lo); |
| 308 | __m128i mid = XOR_256TO128(gstate->mid); |
| 309 | #undef XOR_256TO128 |
| 310 | |
| 311 | lo = gfmul_do_reduce(hi, lo, mid); |
| 312 | gstate->lo = _mm256_castsi128_si256(lo); |
| 313 | } |
| 314 | |
| 315 | static inline __m128i gfmul_get_tag256(struct ptls_fusion_gfmul_state256 *gstate, __m128i ek0) |
| 316 | { |
| 317 | __m128i tag = _mm_shuffle_epi8(_mm256_castsi256_si128(gstate->lo), byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 318 | tag = _mm_xor_si128(tag, ek0); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 319 | return tag; |
| 320 | } |
| 321 | |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 322 | static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v) |
| 323 | { |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 324 | #define ROUNDKEY(i) (ctx->aesni256 ? _mm256_castsi256_si128(ctx->keys.m256[i]) : ctx->keys.m128[i]) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 325 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 326 | v = _mm_xor_si128(v, ROUNDKEY(0)); |
| 327 | for (size_t i = 1; i < ctx->rounds; ++i) |
| 328 | v = _mm_aesenc_si128(v, ROUNDKEY(i)); |
| 329 | v = _mm_aesenclast_si128(v, ROUNDKEY(ctx->rounds)); |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 330 | |
| 331 | return v; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 332 | |
| 333 | #undef ROUNDKEY |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 334 | } |
| 335 | |
// 32-bytes of 0xff followed by 31-bytes of 0x00; reading N bytes from offset (32 - l) yields l bytes of 0xff followed by zeros
static const uint8_t loadn_mask[63] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /*  0 ..  7 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /*  8 .. 15 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 16 .. 23 */
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  /* 24 .. 31; remaining elements are zero-initialized */
};
// byte shuffle patterns: reading 16 bytes from offset `shift` moves source bytes shift..15 to the front and zero-fills the rest
static const uint8_t loadn_shuffle[31] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80        // latter 15 bytes map to zero
};
| 344 | |
Kazuho Oku | 8b9cd57 | 2022-05-04 13:44:33 +0900 | [diff] [blame] | 345 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 346 | static inline __m128i loadn_end_of_page(const void *p, size_t l) |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 347 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 348 | uintptr_t shift = (uintptr_t)p & 15; |
| 349 | __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift)); |
| 350 | return _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern); |
| 351 | } |
| 352 | |
Kazuho Oku | 196e477 | 2022-05-10 09:52:07 +0900 | [diff] [blame] | 353 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 354 | static inline __m128i loadn128(const void *p, size_t l) |
| 355 | { |
| 356 | __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 32 - l)); |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 357 | uintptr_t mod4k = (uintptr_t)p % 4096; |
Goro Fuji | 9a99cf1 | 2021-11-30 12:07:24 +0000 | [diff] [blame] | 358 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 359 | if (PTLS_LIKELY(mod4k <= 4096 - 16) || mod4k + l > 4096) { |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 360 | v = _mm_loadu_si128(p); |
Kazuho Oku | 079b1d0 | 2020-05-14 02:24:28 +0900 | [diff] [blame] | 361 | } else { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 362 | v = loadn_end_of_page(p, l); |
Kazuho Oku | 079b1d0 | 2020-05-14 02:24:28 +0900 | [diff] [blame] | 363 | } |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 364 | v = _mm_and_si128(v, mask); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 365 | |
MITSUNARI Shigeo | eeff164 | 2020-05-19 14:46:39 +0900 | [diff] [blame] | 366 | return v; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 367 | } |
| 368 | |
Kazuho Oku | 196e477 | 2022-05-10 09:52:07 +0900 | [diff] [blame] | 369 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 370 | static inline __m256i loadn256(const void *p, size_t l) |
| 371 | { |
| 372 | __m256i v, mask = _mm256_loadu_si256((__m256i *)(loadn_mask + 32 - l)); |
| 373 | uintptr_t mod4k = (uintptr_t)p % 4096; |
| 374 | |
| 375 | if (PTLS_LIKELY(mod4k < 4096 - 32) || mod4k + l > 4096) { |
| 376 | v = _mm256_loadu_si256(p); |
| 377 | } else if (l > 16) { |
Kazuho Oku | 7f165e0 | 2022-05-10 07:04:06 +0900 | [diff] [blame] | 378 | __m128i first16 = _mm_loadu_si128(p), second16 = loadn128((uint8_t *)p + 16, l - 16); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 379 | v = _mm256_permute2f128_si256(_mm256_castsi128_si256(first16), _mm256_castsi128_si256(second16), 0x20); |
| 380 | } else if (l == 16) { |
| 381 | v = _mm256_castsi128_si256(_mm_loadu_si128(p)); |
| 382 | } else { |
| 383 | v = _mm256_castsi128_si256(loadn_end_of_page(p, l)); |
| 384 | } |
| 385 | v = _mm256_and_si256(v, mask); |
| 386 | |
| 387 | return v; |
| 388 | } |
| 389 | |
/**
 * Stores the first `l` bytes (0 to 16) of `v` to `_p`; bytes beyond `l` are left untouched.
 */
static inline void storen128(void *_p, size_t l, __m128i v)
{
    uint8_t buf[16], *p = _p;

    /* `buf` is a byte array without any alignment guarantee, therefore an unaligned store has to be used */
    _mm_storeu_si128((__m128i *)buf, v);

    for (size_t i = 0; i != l; ++i)
        p[i] = buf[i];
}
| 399 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 400 | void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr, |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 401 | const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 402 | { |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 403 | /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ |
| 404 | #define AESECB6_INIT() \ |
| 405 | do { \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 406 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 407 | bits0 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 408 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 409 | bits1 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 410 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 411 | bits2 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 412 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 413 | bits3 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 414 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 415 | bits4 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 416 | if (PTLS_LIKELY(srclen > 16 * 5)) { \ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 417 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 418 | bits5 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 419 | } else { \ |
| 420 | if ((state & STATE_EK0_BEEN_FED) == 0) { \ |
| 421 | bits5 = ek0; \ |
| 422 | state |= STATE_EK0_BEEN_FED; \ |
| 423 | } \ |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 424 | if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \ |
| 425 | bits4 = _mm_loadu_si128(supp->input); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 426 | bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 427 | state |= STATE_SUPP_IN_PROCESS; \ |
| 428 | } \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 429 | } \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 430 | __m128i k = ctx->super.ecb.keys.m128[0]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 431 | bits0 = _mm_xor_si128(bits0, k); \ |
| 432 | bits1 = _mm_xor_si128(bits1, k); \ |
| 433 | bits2 = _mm_xor_si128(bits2, k); \ |
| 434 | bits3 = _mm_xor_si128(bits3, k); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 435 | bits4 = _mm_xor_si128(bits4, bits4keys[0]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 436 | bits5 = _mm_xor_si128(bits5, k); \ |
| 437 | } while (0) |
| 438 | |
| 439 | /* aes block update */ |
| 440 | #define AESECB6_UPDATE(i) \ |
| 441 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 442 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 443 | bits0 = _mm_aesenc_si128(bits0, k); \ |
| 444 | bits1 = _mm_aesenc_si128(bits1, k); \ |
| 445 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 446 | bits3 = _mm_aesenc_si128(bits3, k); \ |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 447 | bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 448 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 449 | } while (0) |
| 450 | |
| 451 | /* aesenclast */ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 452 | #define AESECB6_FINAL(i) \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 453 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 454 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 455 | bits0 = _mm_aesenclast_si128(bits0, k); \ |
| 456 | bits1 = _mm_aesenclast_si128(bits1, k); \ |
| 457 | bits2 = _mm_aesenclast_si128(bits2, k); \ |
| 458 | bits3 = _mm_aesenclast_si128(bits3, k); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 459 | bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 460 | bits5 = _mm_aesenclast_si128(bits5, k); \ |
| 461 | } while (0) |
| 462 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 463 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 464 | __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 465 | const __m128i *bits4keys = ctx->super.ecb.keys.m128; /* is changed to supp->ctx.keys when calcurating suppout */ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 466 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 467 | __m128i gdatabuf[6]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 468 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 469 | |
| 470 | // src and dst are updated after the chunk is processed |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 471 | const __m128i *src = input; |
| 472 | __m128i *dst = output; |
| 473 | size_t srclen = inlen; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 474 | // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed indo the processor) |
| 475 | const __m128i *aad = _aad, *dst_ghash = dst; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 476 | size_t dst_ghashlen = srclen; |
| 477 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 478 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 479 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 480 | #define STATE_EK0_BEEN_FED 0x3 |
| 481 | #define STATE_EK0_INCOMPLETE 0x2 |
| 482 | #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1) |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 483 | #define STATE_SUPP_USED 0x4 |
| 484 | #define STATE_SUPP_IN_PROCESS 0x8 |
| 485 | int32_t state = supp != NULL ? STATE_SUPP_USED : 0; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 486 | |
| 487 | /* build counter */ |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 488 | ctr = _mm_insert_epi32(ctr, 1, 0); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 489 | ek0 = _mm_shuffle_epi8(ctr, byteswap128); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 490 | |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 491 | /* start preparing AES */ |
| 492 | AESECB6_INIT(); |
| 493 | AESECB6_UPDATE(1); |
| 494 | |
| 495 | /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */ |
| 496 | const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH |
| 497 | size_t gdata_cnt = 0; |
| 498 | if (PTLS_LIKELY(aadlen != 0)) { |
| 499 | while (gdata_cnt < 6) { |
| 500 | if (PTLS_LIKELY(aadlen < 16)) { |
| 501 | if (aadlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 502 | gdatabuf[gdata_cnt++] = loadn128(aad, aadlen); |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 503 | aadlen = 0; |
| 504 | } |
| 505 | goto MainLoop; |
| 506 | } |
| 507 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); |
| 508 | aadlen -= 16; |
| 509 | } |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 510 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 511 | |
| 512 | /* the main loop */ |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 513 | MainLoop: |
Kazuho Oku | 303153d | 2020-05-08 16:42:16 +0900 | [diff] [blame] | 514 | while (1) { |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 515 | /* run AES and multiplication in parallel */ |
| 516 | size_t i; |
| 517 | for (i = 2; i < gdata_cnt + 2; ++i) { |
| 518 | AESECB6_UPDATE(i); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 519 | gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute); |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 520 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 521 | for (; i < ctx->super.ecb.rounds; ++i) |
Kazuho Oku | d8dc699 | 2020-05-19 14:02:21 +0900 | [diff] [blame] | 522 | AESECB6_UPDATE(i); |
| 523 | AESECB6_FINAL(i); |
| 524 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 525 | /* apply the bit stream to src and write to dest */ |
| 526 | if (PTLS_LIKELY(srclen >= 6 * 16)) { |
| 527 | #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i)) |
| 528 | APPLY(0); |
| 529 | APPLY(1); |
| 530 | APPLY(2); |
| 531 | APPLY(3); |
| 532 | APPLY(4); |
| 533 | APPLY(5); |
Kazuho Oku | 083f531 | 2020-05-07 13:05:10 +0900 | [diff] [blame] | 534 | #undef APPLY |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 535 | dst += 6; |
| 536 | src += 6; |
| 537 | srclen -= 6 * 16; |
| 538 | } else { |
| 539 | if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) { |
| 540 | ek0 = bits5; |
| 541 | state &= ~STATE_EK0_INCOMPLETE; |
| 542 | } |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 543 | if ((state & STATE_SUPP_IN_PROCESS) != 0) { |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 544 | _mm_storeu_si128((__m128i *)supp->output, bits4); |
| 545 | state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS); |
Kazuho Oku | a1a81e6 | 2020-05-09 03:46:46 +0900 | [diff] [blame] | 546 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 547 | if (srclen != 0) { |
| 548 | #define APPLY(i) \ |
| 549 | do { \ |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 550 | if (PTLS_LIKELY(srclen >= 16)) { \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 551 | _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \ |
| 552 | srclen -= 16; \ |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 553 | } else if (PTLS_LIKELY(srclen != 0)) { \ |
| 554 | bits0 = bits##i; \ |
| 555 | goto ApplyRemainder; \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 556 | } else { \ |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 557 | goto ApplyEnd; \ |
| 558 | } \ |
| 559 | } while (0) |
| 560 | APPLY(0); |
| 561 | APPLY(1); |
| 562 | APPLY(2); |
| 563 | APPLY(3); |
| 564 | APPLY(4); |
| 565 | APPLY(5); |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 566 | #undef APPLY |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 567 | goto ApplyEnd; |
| 568 | ApplyRemainder: |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 569 | storen128(dst, srclen, _mm_xor_si128(loadn128(src, srclen), bits0)); |
Kazuho Oku | ea21c50 | 2020-05-18 16:25:53 +0900 | [diff] [blame] | 570 | dst = (__m128i *)((uint8_t *)dst + srclen); |
| 571 | srclen = 0; |
| 572 | ApplyEnd:; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 573 | } |
| 574 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 575 | |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 576 | /* next block AES starts here */ |
| 577 | AESECB6_INIT(); |
| 578 | |
| 579 | AESECB6_UPDATE(1); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 580 | |
| 581 | /* setup gdata */ |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 582 | if (PTLS_UNLIKELY(aadlen != 0)) { |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 583 | gdata_cnt = 0; |
| 584 | while (gdata_cnt < 6) { |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 585 | if (aadlen < 16) { |
| 586 | if (aadlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 587 | gdatabuf[gdata_cnt++] = loadn128(aad, aadlen); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 588 | aadlen = 0; |
| 589 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 590 | goto GdataFillDST; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 591 | } |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 592 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++); |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 593 | aadlen -= 16; |
| 594 | } |
| 595 | gdata = gdatabuf; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 596 | } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) { |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 597 | gdata = dst_ghash; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 598 | gdata_cnt = 6; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 599 | dst_ghash += 6; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 600 | dst_ghashlen -= 96; |
| 601 | } else { |
| 602 | gdata_cnt = 0; |
| 603 | GdataFillDST: |
| 604 | while (gdata_cnt < 6) { |
| 605 | if (dst_ghashlen < 16) { |
| 606 | if (dst_ghashlen != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 607 | gdatabuf[gdata_cnt++] = loadn128(dst_ghash, dst_ghashlen); |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 608 | dst_ghashlen = 0; |
| 609 | } |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 610 | if (gdata_cnt < 6) |
| 611 | goto Finish; |
Kazuho Oku | 274a572 | 2020-05-07 22:56:07 +0900 | [diff] [blame] | 612 | break; |
| 613 | } |
| 614 | gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++); |
| 615 | dst_ghashlen -= 16; |
| 616 | } |
| 617 | gdata = gdatabuf; |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 618 | } |
Kazuho Oku | fa13ede | 2020-05-06 16:19:57 +0900 | [diff] [blame] | 619 | } |
| 620 | |
Kazuho Oku | 91c3b18 | 2020-05-10 05:25:48 +0900 | [diff] [blame] | 621 | Finish: |
| 622 | gdatabuf[gdata_cnt++] = ac; |
| 623 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 624 | /* We have complete set of data to be fed into GHASH. Let's finish the remaining calculation. |
     * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM
| 626 | * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead. |
| 627 | */ |
| 628 | assert(STATE_EK0_READY()); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 629 | for (size_t i = 0; i < gdata_cnt; ++i) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 630 | gfmul_nextstep128(&gstate, gdatabuf[i], --ghash_precompute); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 631 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 632 | gfmul_reduce128(&gstate); |
| 633 | _mm_storeu_si128(dst, gfmul_get_tag128(&gstate, ek0)); |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 634 | |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 635 | /* Finish the calculation of supplemental vector. Done at the very last, because the sample might cover the GCM tag. */ |
| 636 | if ((state & STATE_SUPP_USED) != 0) { |
| 637 | size_t i; |
| 638 | if ((state & STATE_SUPP_IN_PROCESS) == 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 639 | bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 640 | bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]); |
| 641 | i = 1; |
| 642 | } else { |
| 643 | i = 2; |
| 644 | } |
| 645 | do { |
| 646 | bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 647 | } while (i != ctx->super.ecb.rounds); |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 648 | bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); |
Kazuho Oku | 1cf91f6 | 2020-05-13 15:11:14 +0900 | [diff] [blame] | 649 | _mm_storeu_si128((__m128i *)supp->output, bits4); |
| 650 | } |
| 651 | |
Kazuho Oku | 8b4dfee | 2020-05-10 12:09:03 +0900 | [diff] [blame] | 652 | #undef AESECB6_INIT |
| 653 | #undef AESECB6_UPDATE |
| 654 | #undef AESECB6_FINAL |
| 655 | #undef STATE_EK0_BEEN_FOUND |
| 656 | #undef STATE_EK0_READY |
| 657 | #undef STATE_SUPP_IN_PROCESS |
| 658 | } |
| 659 | |
/**
 * Decrypts `inlen` bytes found at `input` into `output`, hashing `_aad` (`aadlen` bytes) and the ciphertext with GHASH along
 * the way, and finally compares the calculated tag with the 16 bytes found at `tag`.
 *
 * AES (in counter mode, six blocks per iteration) and GHASH (up to six blocks per iteration) are interleaved in one loop. `ctr`
 * is advanced by `one8` before every use; the first counter value is used for `ek0`, the value that is passed to
 * `gfmul_get_tag128` at the end.
 *
 * The context is accessed through the `ptls_fusion_aesgcm_context128` layout.
 *
 * NOTE(review): plaintext is stored to `output` while the loop runs, i.e. before the tag comparison at the very end. Callers
 * presumably have to discard `output` when zero is returned - confirm against the callers.
 *
 * Returns 1 when all 16 bytes of the calculated tag are equal to `tag`, 0 otherwise.
 */
int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
                               const void *_aad, size_t aadlen, const void *tag)
{
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
    /* bits0..bits5 are the six AES states being processed in parallel; bits0 is seeded below (for ek0), the others start as zero */
    __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(),
            bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};
    /* staging area used whenever the GHASH input of one iteration cannot be read directly from `input` */
    __m128i gdatabuf[6];
    /* the last GHASH block: lengths of AAD and input in bits (calculated as `int`), shuffled using byteswap128 */
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
    /* Points one past the last precompute entry to be used (AAD blocks + input blocks + AC). Each call to gfmul_nextstep128
     * below pre-decrements the pointer. */
    struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1;

    const __m128i *gdata; // points to the elements fed into GHASH
    size_t gdata_cnt;

    /* GHASH and AES consume `input` through separate cursors (and separate remaining-length counters) */
    const __m128i *src_ghash = input, *src_aes = input, *aad = _aad;
    __m128i *dst = output;
    /* nondata_aes_cnt is the number of leading AES slots (bits0, bits1, ...) of the current iteration that are NOT used for
     * decrypting payload: one for ek0 on the first iteration, plus one per AAD block that is fed to GHASH in the iteration */
    size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen;

    /* schedule ek0 and suppkey */
    ctr = _mm_add_epi64(ctr, one8);
    bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), ctx->super.ecb.keys.m128[0]);
    ++nondata_aes_cnt;

/* STATE_IS_FIRST_RUN: ek0 has not been captured yet; STATE_GHASH_HAS_MORE: AC has not been handed to GHASH yet */
#define STATE_IS_FIRST_RUN 0x1
#define STATE_GHASH_HAS_MORE 0x2
    int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE;

    /* the main loop */
    while (1) {

        /* setup gdata */
        if (PTLS_UNLIKELY(aadlen != 0)) {
            /* AAD remains; copy up to six blocks of it to gdatabuf (the last one may be partial and is loaded using loadn128),
             * then fill the rest of the buffer from the ciphertext */
            gdata = gdatabuf;
            gdata_cnt = 0;
            while (gdata_cnt < 6) {
                if (aadlen < 16) {
                    if (aadlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
                        aadlen = 0;
                        ++nondata_aes_cnt;
                    }
                    goto GdataFillSrc;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
                aadlen -= 16;
                ++nondata_aes_cnt;
            }
        } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) {
            /* fast path: six full blocks of ciphertext are hashed directly from `input` without copying */
            gdata = src_ghash;
            gdata_cnt = 6;
            src_ghash += 6;
            src_ghashlen -= 6 * 16;
        } else {
            /* less than six full blocks of ciphertext remain; gather them in gdatabuf and, if there is room, append AC */
            gdata = gdatabuf;
            gdata_cnt = 0;
        GdataFillSrc:
            while (gdata_cnt < 6) {
                if (src_ghashlen < 16) {
                    if (src_ghashlen != 0) {
                        gdatabuf[gdata_cnt++] = loadn128(src_ghash, src_ghashlen);
                        src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen);
                        src_ghashlen = 0;
                    }
                    /* AC is appended only once, and only if a slot is left; otherwise it is handled after the loop */
                    if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) {
                        gdatabuf[gdata_cnt++] = ac;
                        state &= ~STATE_GHASH_HAS_MORE;
                    }
                    break;
                }
                gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++);
                src_ghashlen -= 16;
            }
        }

        /* setup aes bits */
        /* The cases generated by INIT_BITS fall through intentionally: entering at `case n` assigns fresh counter blocks to
         * bits<n>..bits5, leaving the lower-numbered ones untouched. A count of six or more matches no case, in which case none
         * of the bits are re-initialized. */
        if (PTLS_LIKELY(nondata_aes_cnt == 0))
            goto InitAllBits;
        switch (nondata_aes_cnt) {
#define INIT_BITS(n, keys)                                                                                                         \
    case n:                                                                                                                        \
        ctr = _mm_add_epi64(ctr, one8);                                                                                            \
        bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), keys.m128[0]);
        InitAllBits:
            INIT_BITS(0, ctx->super.ecb.keys);
            INIT_BITS(1, ctx->super.ecb.keys);
            INIT_BITS(2, ctx->super.ecb.keys);
            INIT_BITS(3, ctx->super.ecb.keys);
            INIT_BITS(4, ctx->super.ecb.keys);
            INIT_BITS(5, ctx->super.ecb.keys);
#undef INIT_BITS
        }

        { /* run aes and ghash */
#define AESECB6_UPDATE(i)                                                                                                          \
    do {                                                                                                                           \
        __m128i k = ctx->super.ecb.keys.m128[i];                                                                                   \
        bits0 = _mm_aesenc_si128(bits0, k);                                                                                        \
        bits1 = _mm_aesenc_si128(bits1, k);                                                                                        \
        bits2 = _mm_aesenc_si128(bits2, k);                                                                                        \
        bits3 = _mm_aesenc_si128(bits3, k);                                                                                        \
        bits4 = _mm_aesenc_si128(bits4, k);                                                                                        \
        bits5 = _mm_aesenc_si128(bits5, k);                                                                                        \
    } while (0)

            /* one GHASH step is issued per AES round for as long as there is GHASH input (at most six blocks) ... */
            size_t aesi;
            for (aesi = 1; aesi <= gdata_cnt; ++aesi) {
                AESECB6_UPDATE(aesi);
                gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
            }
            /* ... then the remaining AES rounds are run alone, ending with the final round that uses keys.m128[rounds] */
            for (; aesi < ctx->super.ecb.rounds; ++aesi)
                AESECB6_UPDATE(aesi);
            __m128i k = ctx->super.ecb.keys.m128[aesi];
            bits0 = _mm_aesenclast_si128(bits0, k);
            bits1 = _mm_aesenclast_si128(bits1, k);
            bits2 = _mm_aesenclast_si128(bits2, k);
            bits3 = _mm_aesenclast_si128(bits3, k);
            bits4 = _mm_aesenclast_si128(bits4, k);
            bits5 = _mm_aesenclast_si128(bits5, k);

#undef AESECB6_UPDATE
        }

        /* apply aes bits */
        if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) {
            /* fast path: all six AES outputs are key stream and at least six full blocks of input remain */
#define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i))
            APPLY(0);
            APPLY(1);
            APPLY(2);
            APPLY(3);
            APPLY(4);
            APPLY(5);
#undef APPLY
            dst += 6;
            src_aes += 6;
            src_aeslen -= 6 * 16;
        } else {
            /* on the first run, bits0 holds the encryption of the very first counter value; retain it as ek0 */
            if ((state & STATE_IS_FIRST_RUN) != 0) {
                ek0 = bits0;
                state &= ~STATE_IS_FIRST_RUN;
            }
            /* Starting from the first slot that carries key stream (cases fall through), blocks are decrypted while more than 16
             * bytes remain. Once 16 bytes or less remain, the key stream block to be used for them is moved to bits0 and the loop
             * is left; the final block is then handled at `Finish`. */
            switch (nondata_aes_cnt) {
#define APPLY(i)                                                                                                                   \
    case i:                                                                                                                        \
        if (PTLS_LIKELY(src_aeslen > 16)) {                                                                                        \
            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i));                                           \
            src_aeslen -= 16;                                                                                                      \
        } else {                                                                                                                   \
            bits0 = bits##i;                                                                                                       \
            goto Finish;                                                                                                           \
        }
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
            }
            nondata_aes_cnt = 0;
        }
    }

Finish:
    /* decrypt the last block (a full one, a partial one, or nothing at all) using the key stream retained in bits0 */
    if (src_aeslen == 16) {
        _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0));
    } else if (src_aeslen != 0) {
        storen128(dst, src_aeslen, _mm_xor_si128(loadn128(src_aes, src_aeslen), bits0));
    }

    assert((state & STATE_IS_FIRST_RUN) == 0);

    /* the only case where AES operation is complete and GHASH is not is when the application of AC is remaining */
    if ((state & STATE_GHASH_HAS_MORE) != 0) {
        assert(ghash_precompute - 1 == ctx->ghash);
        gfmul_nextstep128(&gstate, ac, --ghash_precompute);
    }

    gfmul_reduce128(&gstate);
    __m128i calctag = gfmul_get_tag128(&gstate, ek0);

    /* the 16-bit mask is 0xffff only when every byte of the two tags is equal */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff;

#undef STATE_IS_FIRST_RUN
#undef STATE_GHASH_HAS_MORE
}
| 845 | |
/**
 * Key-expansion helper. XORs every 32-bit word of `key` into all the higher words (so that word N becomes the XOR of the
 * original words 0..N), then XORs `temp` into the result.
 */
static __m128i expand_key(__m128i key, __m128i temp)
{
    /* three rounds of "shift left by one word and fold" build the running XOR across the four words */
    for (int pass = 0; pass < 3; ++pass)
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

    return _mm_xor_si128(key, temp);
}
| 856 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 857 | void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size, int aesni256) |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 858 | { |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 859 | assert(is_enc && "decryption is not supported (yet)"); |
| 860 | |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 861 | size_t i = 0; |
| 862 | |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 863 | switch (key_size) { |
| 864 | case 16: /* AES128 */ |
| 865 | ctx->rounds = 10; |
| 866 | break; |
| 867 | case 32: /* AES256 */ |
| 868 | ctx->rounds = 14; |
| 869 | break; |
| 870 | default: |
| 871 | assert(!"invalid key size; AES128 / AES256 are supported"); |
| 872 | break; |
| 873 | } |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 874 | ctx->aesni256 = aesni256; |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 875 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 876 | /* load and expand keys using keys.m128 */ |
| 877 | ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key); |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 878 | if (key_size == 32) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 879 | ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key + 1); |
| 880 | while (1) { |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 881 | #define EXPAND(R) \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 882 | { \ |
| 883 | ctx->keys.m128[i] = \ |
| 884 | expand_key(ctx->keys.m128[i - key_size / 16], \ |
| 885 | _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 886 | if (i == ctx->rounds) \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 887 | break; \ |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 888 | ++i; \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 889 | if (key_size > 24) { \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 890 | ctx->keys.m128[i] = \ |
| 891 | expand_key(ctx->keys.m128[i - key_size / 16], \ |
| 892 | _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \ |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 893 | ++i; \ |
| 894 | } \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 895 | } |
| 896 | EXPAND(0x1); |
| 897 | EXPAND(0x2); |
| 898 | EXPAND(0x4); |
| 899 | EXPAND(0x8); |
| 900 | EXPAND(0x10); |
| 901 | EXPAND(0x20); |
| 902 | EXPAND(0x40); |
| 903 | EXPAND(0x80); |
| 904 | EXPAND(0x1b); |
| 905 | EXPAND(0x36); |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 906 | #undef EXPAND |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 907 | } |
| 908 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 909 | /* convert to keys.m256 if aesni256 is used */ |
| 910 | if (ctx->aesni256) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 911 | size_t i = ctx->rounds; |
| 912 | do { |
| 913 | ctx->keys.m256[i] = _mm256_broadcastsi128_si256(ctx->keys.m128[i]); |
| 914 | } while (i-- != 0); |
| 915 | } |
Kazuho Oku | e46529c | 2020-05-08 13:38:39 +0900 | [diff] [blame] | 916 | } |
| 917 | |
| 918 | void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx) |
| 919 | { |
| 920 | ptls_clear_memory(ctx, sizeof(*ctx)); |
| 921 | } |
| 922 | |
Kazuho Oku | 4c19f50 | 2020-05-15 08:30:35 +0900 | [diff] [blame] | 923 | void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src) |
| 924 | { |
| 925 | __m128i v = _mm_loadu_si128(src); |
| 926 | v = aesecb_encrypt(ctx, v); |
| 927 | _mm_storeu_si128(dst, v); |
| 928 | } |
| 929 | |
/**
 * calculates how many ghash entries have to be prepared for handling an AEAD block of up to `capacity` bytes
 */
static size_t aesgcm_calc_ghash_cnt(size_t capacity)
{
    /* number of 16-byte blocks, with a trailing partial block counted as one */
    size_t blocks = (capacity + 15) >> 4;

    /* one more for the worst split of the size between AAD and payload, and another one for hashing AC */
    return blocks + 2;
}
| 938 | |
| 939 | static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx) |
| 940 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 941 | __m128i *H, *r, *Hprev, H0; |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 942 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 943 | if (ctx->ecb.aesni256) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 944 | struct ptls_fusion_aesgcm_context256 *ctx256 = (void *)ctx; |
| 945 | #define GET_SLOT(i, mem) (&ctx256->ghash[(i) / 2].mem[(i) % 2 == 0]) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 946 | H = GET_SLOT(ctx->ghash_cnt, H); |
| 947 | r = GET_SLOT(ctx->ghash_cnt, r); |
| 948 | Hprev = ctx->ghash_cnt == 0 ? NULL : GET_SLOT(ctx->ghash_cnt - 1, H); |
| 949 | #undef GET_SLOT |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 950 | H0 = ctx256->ghash[0].H[1]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 951 | } else { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 952 | struct ptls_fusion_aesgcm_context128 *ctx128 = (void *)ctx; |
| 953 | H = &ctx128->ghash[ctx->ghash_cnt].H; |
| 954 | r = &ctx128->ghash[ctx->ghash_cnt].r; |
| 955 | Hprev = ctx->ghash_cnt == 0 ? NULL : &ctx128->ghash[ctx->ghash_cnt - 1].H; |
| 956 | H0 = ctx128->ghash[0].H; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 957 | } |
| 958 | |
| 959 | if (Hprev != NULL) |
| 960 | *H = gfmul(*Hprev, H0); |
| 961 | |
| 962 | *r = _mm_shuffle_epi32(*H, 78); |
| 963 | *r = _mm_xor_si128(*r, *H); |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 964 | |
| 965 | ++ctx->ghash_cnt; |
| 966 | } |
| 967 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 968 | static size_t calc_aesgcm_context_size(size_t *ghash_cnt, int aesni256) |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 969 | { |
| 970 | size_t sz; |
| 971 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 972 | if (aesni256) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 973 | if (*ghash_cnt % 2 != 0) |
| 974 | ++*ghash_cnt; |
| 975 | sz = offsetof(struct ptls_fusion_aesgcm_context256, ghash) + |
| 976 | sizeof(union ptls_fusion_aesgcm_ghash_precompute256) * *ghash_cnt / 2; |
| 977 | } else { |
| 978 | sz = offsetof(struct ptls_fusion_aesgcm_context128, ghash) + |
| 979 | sizeof(struct ptls_fusion_aesgcm_ghash_precompute128) * *ghash_cnt; |
| 980 | } |
| 981 | return sz; |
| 982 | } |
| 983 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 984 | static ptls_fusion_aesgcm_context_t *new_aesgcm(const void *key, size_t key_size, size_t capacity, int aesni256) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 985 | { |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 986 | ptls_fusion_aesgcm_context_t *ctx; |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 987 | size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity), ctx_size = calc_aesgcm_context_size(&ghash_cnt, aesni256); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 988 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 989 | if ((ctx = aligned_alloc(32, ctx_size)) == NULL) |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 990 | return NULL; |
| 991 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 992 | ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size, aesni256); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 993 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 994 | ctx->capacity = capacity; |
| 995 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 996 | __m128i H0 = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128()); |
| 997 | H0 = _mm_shuffle_epi8(H0, byteswap128); |
| 998 | H0 = transformH(H0); |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 999 | if (ctx->ecb.aesni256) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1000 | ((struct ptls_fusion_aesgcm_context256 *)ctx)->ghash[0].H[1] = H0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1001 | } else { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1002 | ((struct ptls_fusion_aesgcm_context128 *)ctx)->ghash[0].H = H0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1003 | } |
| 1004 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1005 | ctx->ghash_cnt = 0; |
| 1006 | while (ctx->ghash_cnt < ghash_cnt) |
| 1007 | setup_one_ghash_entry(ctx); |
| 1008 | |
| 1009 | return ctx; |
| 1010 | } |
| 1011 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1012 | ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity) |
| 1013 | { |
| 1014 | return new_aesgcm(key, key_size, capacity, 0); |
| 1015 | } |
| 1016 | |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1017 | ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity) |
| 1018 | { |
Kazuho Oku | 75e71f5 | 2022-09-15 09:43:03 +0900 | [diff] [blame] | 1019 | size_t new_ghash_cnt = aesgcm_calc_ghash_cnt(capacity); |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1020 | |
Kazuho Oku | 75e71f5 | 2022-09-15 09:43:03 +0900 | [diff] [blame] | 1021 | if (new_ghash_cnt <= ctx->ghash_cnt) |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1022 | return ctx; |
| 1023 | |
Kazuho Oku | 998f2e0 | 2022-09-15 13:10:26 +0900 | [diff] [blame] | 1024 | size_t new_ctx_size = calc_aesgcm_context_size(&new_ghash_cnt, ctx->ecb.aesni256), |
| 1025 | old_ctx_size = calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256); |
| 1026 | |
Kazuho Oku | 688d70c | 2022-05-11 13:46:22 +0900 | [diff] [blame] | 1027 | ptls_fusion_aesgcm_context_t *newp; |
Kazuho Oku | 75e71f5 | 2022-09-15 09:43:03 +0900 | [diff] [blame] | 1028 | if ((newp = aligned_alloc(32, new_ctx_size)) == NULL) |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1029 | return NULL; |
Kazuho Oku | 998f2e0 | 2022-09-15 13:10:26 +0900 | [diff] [blame] | 1030 | memcpy(newp, ctx, old_ctx_size); |
| 1031 | ptls_clear_memory(ctx, old_ctx_size); |
Kazuho Oku | ba56a5d | 2022-12-12 09:24:11 +0900 | [diff] [blame] | 1032 | aligned_free(ctx); |
Kazuho Oku | 688d70c | 2022-05-11 13:46:22 +0900 | [diff] [blame] | 1033 | ctx = newp; |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1034 | |
| 1035 | ctx->capacity = capacity; |
Kazuho Oku | 75e71f5 | 2022-09-15 09:43:03 +0900 | [diff] [blame] | 1036 | while (ctx->ghash_cnt < new_ghash_cnt) |
Kazuho Oku | 7fd7c84 | 2020-05-18 14:04:42 +0900 | [diff] [blame] | 1037 | setup_one_ghash_entry(ctx); |
Kazuho Oku | f198c1b | 2020-05-08 00:45:29 +0900 | [diff] [blame] | 1038 | |
| 1039 | return ctx; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1040 | } |
| 1041 | |
Kazuho Oku | 31ebd7d | 2020-05-15 06:37:23 +0900 | [diff] [blame] | 1042 | void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1043 | { |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 1044 | ptls_clear_memory(ctx, calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256)); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1045 | /* skip ptls_fusion_aesecb_dispose, based on the knowledge that it does not allocate memory elsewhere */ |
| 1046 | |
Kazuho Oku | ba56a5d | 2022-12-12 09:24:11 +0900 | [diff] [blame] | 1047 | aligned_free(ctx); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1048 | } |
| 1049 | |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1050 | static void ctr_dispose(ptls_cipher_context_t *_ctx) |
| 1051 | { |
| 1052 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1053 | ptls_fusion_aesecb_dispose(&ctx->fusion); |
| 1054 | _mm_storeu_si128(&ctx->bits, _mm_setzero_si128()); |
| 1055 | } |
| 1056 | |
| 1057 | static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv) |
| 1058 | { |
| 1059 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1060 | _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv))); |
| 1061 | ctx->is_ready = 1; |
| 1062 | } |
| 1063 | |
| 1064 | static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len) |
| 1065 | { |
| 1066 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1067 | |
| 1068 | assert((ctx->is_ready && len <= 16) || |
| 1069 | !"CTR transfomation is supported only once per call to `init` and the maximum size is limited to 16 bytes"); |
| 1070 | ctx->is_ready = 0; |
| 1071 | |
| 1072 | if (len < 16) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1073 | storen128(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn128(input, len))); |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1074 | } else { |
| 1075 | _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input))); |
| 1076 | } |
| 1077 | } |
| 1078 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1079 | static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size) |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1080 | { |
| 1081 | struct ctr_context *ctx = (struct ctr_context *)_ctx; |
| 1082 | |
| 1083 | ctx->super.do_dispose = ctr_dispose; |
| 1084 | ctx->super.do_init = ctr_init; |
| 1085 | ctx->super.do_transform = ctr_transform; |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 1086 | ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size, 0 /* probably we do not need aesni256 for CTR? */); |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1087 | ctx->is_ready = 0; |
| 1088 | |
| 1089 | return 0; |
| 1090 | } |
| 1091 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1092 | static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) |
| 1093 | { |
| 1094 | return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE); |
| 1095 | } |
| 1096 | |
| 1097 | static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key) |
| 1098 | { |
| 1099 | return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE); |
| 1100 | } |
| 1101 | |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1102 | static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx) |
| 1103 | { |
| 1104 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1105 | |
Kazuho Oku | 31ebd7d | 2020-05-15 06:37:23 +0900 | [diff] [blame] | 1106 | ptls_fusion_aesgcm_free(ctx->aesgcm); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1107 | } |
| 1108 | |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1109 | static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen) |
| 1110 | { |
| 1111 | assert(!"FIXME"); |
| 1112 | } |
| 1113 | |
| 1114 | static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen) |
| 1115 | { |
| 1116 | assert(!"FIXME"); |
| 1117 | return SIZE_MAX; |
| 1118 | } |
| 1119 | |
| 1120 | static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output) |
| 1121 | { |
| 1122 | assert(!"FIXME"); |
| 1123 | return SIZE_MAX; |
| 1124 | } |
| 1125 | |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1126 | static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1127 | { |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1128 | __m128i ctr = _mm_setzero_si128(); |
| 1129 | ctr = _mm_insert_epi64(ctr, seq, 0); |
Kazuho Oku | 076982f | 2020-05-14 09:28:44 +0900 | [diff] [blame] | 1130 | ctr = _mm_slli_si128(ctr, 4); |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1131 | ctr = _mm_xor_si128(ctx->static_iv, ctr); |
| 1132 | return ctr; |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1133 | } |
| 1134 | |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1135 | void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1136 | const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp) |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1137 | { |
| 1138 | struct aesgcm_context *ctx = (void *)_ctx; |
| 1139 | |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1140 | if (inlen + aadlen > ctx->aesgcm->capacity) |
| 1141 | ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen); |
| 1142 | ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp); |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1143 | } |
| 1144 | |
| 1145 | static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1146 | const void *aad, size_t aadlen) |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1147 | { |
| 1148 | assert(!"FIXME"); |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1149 | } |
| 1150 | |
| 1151 | static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq, |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1152 | const void *aad, size_t aadlen) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1153 | { |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1154 | struct aesgcm_context *ctx = (void *)_ctx; |
| 1155 | |
| 1156 | if (inlen < 16) |
| 1157 | return SIZE_MAX; |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1158 | |
| 1159 | size_t enclen = inlen - 16; |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1160 | if (enclen + aadlen > ctx->aesgcm->capacity) |
| 1161 | ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen); |
| 1162 | if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen, |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1163 | (const uint8_t *)input + enclen)) |
Kazuho Oku | 94feca2 | 2020-05-11 16:34:44 +0900 | [diff] [blame] | 1164 | return SIZE_MAX; |
| 1165 | return enclen; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1166 | } |
| 1167 | |
Kazuho Oku | bb2cac2 | 2023-02-13 16:09:08 +0900 | [diff] [blame] | 1168 | static inline void aesgcm_get_iv(ptls_aead_context_t *_ctx, void *iv) |
Christian Huitema | 4f8c485 | 2020-12-05 20:31:30 -0800 | [diff] [blame] | 1169 | { |
| 1170 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
Kazuho Oku | bb2cac2 | 2023-02-13 16:09:08 +0900 | [diff] [blame] | 1171 | |
| 1172 | __m128i m128 = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
| 1173 | storen128(iv, PTLS_AESGCM_IV_SIZE, m128); |
| 1174 | } |
| 1175 | |
| 1176 | static inline void aesgcm_set_iv(ptls_aead_context_t *_ctx, const void *iv) |
| 1177 | { |
| 1178 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1179 | |
| 1180 | ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE); |
| 1181 | ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
Christian Huitema | 4f8c485 | 2020-12-05 20:31:30 -0800 | [diff] [blame] | 1182 | } |
| 1183 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1184 | static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size) |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1185 | { |
| 1186 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
| 1187 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1188 | ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE); |
| 1189 | ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
Kazuho Oku | ae2aeda | 2020-06-14 15:13:18 +0900 | [diff] [blame] | 1190 | if (key == NULL) |
| 1191 | return 0; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1192 | |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1193 | ctx->super.dispose_crypto = aesgcm_dispose_crypto; |
Kazuho Oku | bb2cac2 | 2023-02-13 16:09:08 +0900 | [diff] [blame] | 1194 | ctx->super.do_get_iv = aesgcm_get_iv; |
| 1195 | ctx->super.do_set_iv = aesgcm_set_iv; |
Kazuho Oku | ea42ef7 | 2022-05-02 07:40:01 +0900 | [diff] [blame] | 1196 | ctx->super.do_encrypt_init = aead_do_encrypt_init; |
| 1197 | ctx->super.do_encrypt_update = aead_do_encrypt_update; |
| 1198 | ctx->super.do_encrypt_final = aead_do_encrypt_final; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1199 | ctx->super.do_encrypt = aead_do_encrypt; |
Kazuho Oku | 3a50ee1 | 2022-04-27 16:20:16 +0900 | [diff] [blame] | 1200 | ctx->super.do_encrypt_v = aead_do_encrypt_v; |
Kazuho Oku | ba2b960 | 2020-05-14 08:21:39 +0900 | [diff] [blame] | 1201 | ctx->super.do_decrypt = aead_do_decrypt; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1202 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 1203 | ctx->aesgcm = new_aesgcm(key, key_size, 1500 /* assume ordinary packet size */, 0 /* no support for aesni256 yet */); |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1204 | |
| 1205 | return 0; |
| 1206 | } |
| 1207 | |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1208 | static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
| 1209 | { |
| 1210 | return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE); |
| 1211 | } |
| 1212 | |
| 1213 | static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
| 1214 | { |
| 1215 | return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE); |
| 1216 | } |
| 1217 | |
/* Global flag, initially 0. NOTE(review): presumably set elsewhere once 256-bit AES-NI support has been detected - confirm
 * where it is written. */
int ptls_fusion_can_aesni256 = 0;
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1219 | ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR", |
| 1220 | PTLS_AES128_KEY_SIZE, |
| 1221 | 1, // block size |
| 1222 | PTLS_AES_IV_SIZE, |
| 1223 | sizeof(struct ctr_context), |
| 1224 | aes128ctr_setup}; |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1225 | ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR", |
| 1226 | PTLS_AES256_KEY_SIZE, |
| 1227 | 1, // block size |
| 1228 | PTLS_AES_IV_SIZE, |
| 1229 | sizeof(struct ctr_context), |
| 1230 | aes256ctr_setup}; |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1231 | ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM", |
Christian Huitema | 11b75d5 | 2020-09-11 23:01:38 -0700 | [diff] [blame] | 1232 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 1233 | PTLS_AESGCM_INTEGRITY_LIMIT, |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1234 | &ptls_fusion_aes128ctr, |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1235 | NULL, // &ptls_fusion_aes128ecb, |
| 1236 | PTLS_AES128_KEY_SIZE, |
| 1237 | PTLS_AESGCM_IV_SIZE, |
| 1238 | PTLS_AESGCM_TAG_SIZE, |
Kazuho Oku | 93944ce | 2022-07-06 16:41:08 +0900 | [diff] [blame] | 1239 | {0}, // while it may work, no reason to support TLS/1.2 |
Kazuho Oku | 9dc6982 | 2022-06-08 13:25:52 +0900 | [diff] [blame] | 1240 | 0, |
Kazuho Oku | 1edf707 | 2022-07-11 10:30:43 +0900 | [diff] [blame] | 1241 | 0, |
Kazuho Oku | 32f6c7b | 2020-05-05 22:14:41 +0900 | [diff] [blame] | 1242 | sizeof(struct aesgcm_context), |
Kazuho Oku | 9f2fb30 | 2020-05-11 13:13:26 +0900 | [diff] [blame] | 1243 | aes128gcm_setup}; |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1244 | ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM", |
Christian Huitema | 11b75d5 | 2020-09-11 23:01:38 -0700 | [diff] [blame] | 1245 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 1246 | PTLS_AESGCM_INTEGRITY_LIMIT, |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1247 | &ptls_fusion_aes256ctr, |
| 1248 | NULL, // &ptls_fusion_aes256ecb, |
| 1249 | PTLS_AES256_KEY_SIZE, |
| 1250 | PTLS_AESGCM_IV_SIZE, |
| 1251 | PTLS_AESGCM_TAG_SIZE, |
Kazuho Oku | 93944ce | 2022-07-06 16:41:08 +0900 | [diff] [blame] | 1252 | {0}, // while it may work, no reason to support TLS/1.2 |
Kazuho Oku | 9dc6982 | 2022-06-08 13:25:52 +0900 | [diff] [blame] | 1253 | 0, |
Kazuho Oku | 1edf707 | 2022-07-11 10:30:43 +0900 | [diff] [blame] | 1254 | 0, |
Kazuho Oku | 6b84978 | 2020-05-15 11:44:21 +0900 | [diff] [blame] | 1255 | sizeof(struct aesgcm_context), |
| 1256 | aes256gcm_setup}; |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 1257 | |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1258 | static inline size_t calc_total_length(ptls_iovec_t *input, size_t incnt) |
| 1259 | { |
| 1260 | size_t totlen = 0; |
| 1261 | for (size_t i = 0; i < incnt; ++i) |
| 1262 | totlen += input[i].len; |
| 1263 | return totlen; |
| 1264 | } |
| 1265 | |
Kazuho Oku | a7006dc | 2022-05-09 17:24:31 +0900 | [diff] [blame] | 1266 | static inline void reduce_aad128(struct ptls_fusion_gfmul_state128 *gstate, struct ptls_fusion_aesgcm_ghash_precompute128 *ghash, |
| 1267 | const void *_aad, size_t aadlen) |
| 1268 | { |
| 1269 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute; |
| 1270 | const uint8_t *aad = _aad; |
| 1271 | |
| 1272 | while (PTLS_UNLIKELY(aadlen >= 6 * 16)) { |
| 1273 | ghash_precompute = ghash + 6; |
| 1274 | gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1275 | aad += 16; |
| 1276 | aadlen -= 16; |
| 1277 | for (int i = 1; i < 6; ++i) { |
| 1278 | gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1279 | aad += 16; |
| 1280 | aadlen -= 16; |
| 1281 | } |
| 1282 | gfmul_reduce128(gstate); |
| 1283 | } |
| 1284 | |
| 1285 | if (PTLS_LIKELY(aadlen != 0)) { |
| 1286 | ghash_precompute = ghash + (aadlen + 15) / 16; |
| 1287 | if (PTLS_UNLIKELY(aadlen >= 16)) { |
| 1288 | gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1289 | aad += 16; |
| 1290 | aadlen -= 16; |
| 1291 | while (aadlen >= 16) { |
| 1292 | gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute); |
| 1293 | aad += 16; |
| 1294 | aadlen -= 16; |
| 1295 | } |
| 1296 | if (PTLS_LIKELY(aadlen != 0)) |
| 1297 | gfmul_nextstep128(gstate, loadn128(aad, aadlen), --ghash_precompute); |
| 1298 | } else { |
| 1299 | gfmul_firststep128(gstate, loadn128(aad, aadlen), --ghash_precompute); |
| 1300 | } |
| 1301 | assert(ghash == ghash_precompute); |
| 1302 | gfmul_reduce128(gstate); |
| 1303 | } |
| 1304 | } |
| 1305 | |
Kazuho Oku | 13ced82 | 2022-05-10 10:09:54 +0900 | [diff] [blame] | 1306 | NO_SANITIZE_ADDRESS |
| 1307 | static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output) |
| 1308 | { |
| 1309 | uint8_t *encp; |
| 1310 | |
| 1311 | if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) { |
| 1312 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf)))); |
| 1313 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32))); |
| 1314 | *output -= encp - encbuf; |
| 1315 | } |
| 1316 | |
| 1317 | return encp; |
| 1318 | } |
| 1319 | |
| 1320 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1321 | static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end) |
| 1322 | { |
| 1323 | /* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line |
| 1324 | * would likely be read when the next TLS record is being built. */ |
| 1325 | |
| 1326 | for (; end - src >= 64; dst += 64, src += 64) { |
| 1327 | _mm256_stream_si256((void *)dst, _mm256_load_si256((void *)src)); |
| 1328 | _mm256_stream_si256((void *)(dst + 32), _mm256_load_si256((void *)(src + 32))); |
| 1329 | } |
| 1330 | _mm_sfence(); /* weakly ordered writes have to be synced before being passed to NIC */ |
| 1331 | if (src != end) { |
| 1332 | for (; end - src >= 16; dst += 16, src += 16) |
| 1333 | _mm_store_si128((void *)dst, _mm_load_si128((void *)src)); |
| 1334 | if (src != end) |
| 1335 | storen128((void *)dst, end - src, loadn128((void *)src, end - src)); |
| 1336 | } |
| 1337 | } |
| 1338 | |
Kazuho Oku | 4543982 | 2022-05-10 10:16:49 +0900 | [diff] [blame] | 1339 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1340 | static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt, |
Kazuho Oku | a7006dc | 2022-05-09 17:24:31 +0900 | [diff] [blame] | 1341 | uint64_t seq, const void *aad, size_t aadlen) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1342 | { |
| 1343 | /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ |
| 1344 | #define AESECB6_INIT() \ |
| 1345 | do { \ |
| 1346 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1347 | bits0 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1348 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1349 | bits1 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1350 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1351 | bits2 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1352 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1353 | bits3 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1354 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1355 | bits4 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1356 | if (PTLS_LIKELY(srclen > 16 * 5) || src_vecleft != 0) { \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1357 | ctr = _mm_add_epi64(ctr, one8); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1358 | bits5 = _mm_shuffle_epi8(ctr, byteswap128); \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1359 | } else { \ |
Kazuho Oku | 4f6bcae | 2022-05-09 07:02:13 +0900 | [diff] [blame] | 1360 | bits5 = ek0; \ |
| 1361 | state |= STATE_EK0_READY; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1362 | } \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1363 | __m128i k = ctx->super.ecb.keys.m128[0]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1364 | bits0 = _mm_xor_si128(bits0, k); \ |
| 1365 | bits1 = _mm_xor_si128(bits1, k); \ |
| 1366 | bits2 = _mm_xor_si128(bits2, k); \ |
| 1367 | bits3 = _mm_xor_si128(bits3, k); \ |
| 1368 | bits4 = _mm_xor_si128(bits4, k); \ |
| 1369 | bits5 = _mm_xor_si128(bits5, k); \ |
| 1370 | } while (0) |
| 1371 | |
| 1372 | /* aes block update */ |
| 1373 | #define AESECB6_UPDATE(i) \ |
| 1374 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1375 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1376 | bits0 = _mm_aesenc_si128(bits0, k); \ |
| 1377 | bits1 = _mm_aesenc_si128(bits1, k); \ |
| 1378 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 1379 | bits3 = _mm_aesenc_si128(bits3, k); \ |
| 1380 | bits4 = _mm_aesenc_si128(bits4, k); \ |
| 1381 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 1382 | } while (0) |
| 1383 | |
| 1384 | /* aesenclast */ |
| 1385 | #define AESECB6_FINAL(i) \ |
| 1386 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1387 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1388 | bits0 = _mm_aesenclast_si128(bits0, k); \ |
| 1389 | bits1 = _mm_aesenclast_si128(bits1, k); \ |
| 1390 | bits2 = _mm_aesenclast_si128(bits2, k); \ |
| 1391 | bits3 = _mm_aesenclast_si128(bits3, k); \ |
| 1392 | bits4 = _mm_aesenclast_si128(bits4, k); \ |
| 1393 | bits5 = _mm_aesenclast_si128(bits5, k); \ |
| 1394 | } while (0) |
| 1395 | |
| 1396 | struct aesgcm_context *agctx = (void *)_ctx; |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1397 | uint8_t *output = _output; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1398 | |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1399 | #define STATE_EK0_READY 0x1 |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1400 | #define STATE_COPY_128B 0x2 |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1401 | int32_t state = 0; |
| 1402 | |
Kazuho Oku | 9f8e12a | 2022-05-11 08:21:45 +0900 | [diff] [blame] | 1403 | /* Bytes are written here first then written using NT store instructions, 64 bytes at a time. */ |
| 1404 | uint8_t encbuf[32 * 6] __attribute__((aligned(32))), *encp; |
Kazuho Oku | 122a334 | 2022-05-04 14:08:47 +0900 | [diff] [blame] | 1405 | |
| 1406 | /* `encbuf` should be large enough to store up to 63-bytes of unaligned bytes, 6 16-byte AES blocks, plus AEAD tag that is |
| 1407 | * append to the ciphertext before writing the bytes to main memory using NT store instructions. */ |
| 1408 | PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16); |
| 1409 | |
Kazuho Oku | 13ced82 | 2022-05-10 10:09:54 +0900 | [diff] [blame] | 1410 | /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */ |
| 1411 | encp = load_preceding_unaligned(encbuf, &output); |
| 1412 | |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1413 | /* First write would be 128 bytes (32+6*16), if encbuf contains no less than 32 bytes already. */ |
| 1414 | if (encp - encbuf >= 32) |
| 1415 | state |= STATE_COPY_128B; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1416 | |
| 1417 | /* setup ctr, retain Ek(0), len(A) | len(C) to be fed into GCM */ |
| 1418 | __m128i ctr = calc_counter(agctx, seq); |
| 1419 | ctr = _mm_insert_epi32(ctr, 1, 0); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1420 | __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128); |
| 1421 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1422 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1423 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1424 | __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1425 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1426 | |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1427 | /* find the first non-empty vec */ |
| 1428 | const uint8_t *src = NULL; |
| 1429 | size_t srclen = 0, src_vecleft = incnt; |
| 1430 | while (srclen == 0 && src_vecleft != 0) { |
| 1431 | src = (void *)input[0].base; |
| 1432 | srclen = input[0].len; |
| 1433 | ++input; |
| 1434 | --src_vecleft; |
| 1435 | } |
| 1436 | |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1437 | /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */ |
| 1438 | AESECB6_INIT(); |
| 1439 | AESECB6_UPDATE(1); |
| 1440 | AESECB6_UPDATE(2); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1441 | reduce_aad128(&gstate, ctx->ghash, aad, aadlen); |
| 1442 | for (size_t i = 3; i < ctx->super.ecb.rounds; ++i) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1443 | AESECB6_UPDATE(i); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1444 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1445 | |
| 1446 | /* Main loop. This loop: |
| 1447 | * 1. using current keystream (bits0..bits5), xors a up to 6 * 16 bytes and writes to encbuf, |
| 1448 | * 2. then if there is no more data to be encrypted, exit the loop, otherwise, |
| 1449 | * 3. calculate ghash of the blocks being written to encbuf, |
| 1450 | * 4. calculate next 6 * 16 bytes of keystream, |
| 1451 | * 5. writes encbuf in 64-byte blocks |
| 1452 | * When exitting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be |
| 1453 | * calculated. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1454 | size_t remaining_ghash_from = encp - encbuf; |
Kazuho Oku | 3b2ab61 | 2022-05-09 06:41:48 +0900 | [diff] [blame] | 1455 | if (srclen != 0) { |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1456 | while (1) { |
| 1457 | /* apply the bit stream to input, writing to encbuf */ |
| 1458 | if (PTLS_LIKELY(srclen >= 6 * 16)) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1459 | #define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(src + i * 16)), bits##i)) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1460 | APPLY(0); |
| 1461 | APPLY(1); |
| 1462 | APPLY(2); |
| 1463 | APPLY(3); |
| 1464 | APPLY(4); |
| 1465 | APPLY(5); |
| 1466 | #undef APPLY |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1467 | encp += 6 * 16; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1468 | src += 6 * 16; |
| 1469 | srclen -= 6 * 16; |
| 1470 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1471 | if (src_vecleft == 0) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1472 | remaining_ghash_from = (encp - encbuf) - 96; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1473 | break; |
| 1474 | } |
| 1475 | src = (void *)input[0].base; |
| 1476 | srclen = input[0].len; |
| 1477 | ++input; |
| 1478 | --src_vecleft; |
| 1479 | } |
| 1480 | } else { |
| 1481 | /* slow path, load at most 6 * 16 bytes to encbuf then encrypt in-place */ |
| 1482 | size_t bytes_copied = 0; |
| 1483 | do { |
Kazuho Oku | fa3cd32 | 2022-05-08 17:05:17 +0900 | [diff] [blame] | 1484 | if (srclen >= 16 && bytes_copied < 5 * 16) { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1485 | _mm_storeu_si128((void *)(encp + bytes_copied), _mm_loadu_si128((void *)src)); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1486 | bytes_copied += 16; |
| 1487 | src += 16; |
| 1488 | srclen -= 16; |
| 1489 | } else { |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1490 | encp[bytes_copied++] = *src++; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1491 | --srclen; |
| 1492 | } |
| 1493 | if (PTLS_UNLIKELY(srclen == 0)) { |
Kazuho Oku | 78e6c3a | 2022-06-29 11:17:06 +0900 | [diff] [blame] | 1494 | do { |
| 1495 | if (src_vecleft == 0) |
| 1496 | break; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1497 | src = (void *)input[0].base; |
| 1498 | srclen = input[0].len; |
| 1499 | ++input; |
| 1500 | --src_vecleft; |
Kazuho Oku | 78e6c3a | 2022-06-29 11:17:06 +0900 | [diff] [blame] | 1501 | } while (srclen == 0); |
| 1502 | if (srclen == 0) |
| 1503 | break; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1504 | } |
| 1505 | } while (bytes_copied < 6 * 16); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1506 | #define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(encp + i * 16)), bits##i)) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1507 | APPLY(0); |
| 1508 | APPLY(1); |
| 1509 | APPLY(2); |
| 1510 | APPLY(3); |
| 1511 | APPLY(4); |
| 1512 | APPLY(5); |
| 1513 | #undef APPLY |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1514 | encp += bytes_copied; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1515 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1516 | /* Calculate amonut of data left to be ghashed, as well as zero-clearing the remainedr of partial block, as it |
| 1517 | * will be fed into ghash. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1518 | remaining_ghash_from = (encp - encbuf) - bytes_copied; |
Kazuho Oku | 7fb163f | 2022-05-01 14:09:56 +0900 | [diff] [blame] | 1519 | if ((bytes_copied & 15) != 0) |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1520 | _mm_storeu_si128((void *)encp, _mm_setzero_si128()); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1521 | break; |
| 1522 | } |
| 1523 | } |
| 1524 | |
| 1525 | /* Next 96-byte block starts here. Run AES and ghash in while writing output using non-temporal stores in 64-byte |
| 1526 | * blocks. */ |
| 1527 | AESECB6_INIT(); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1528 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + 6; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1529 | gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encp - 6 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1530 | AESECB6_UPDATE(1); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1531 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 5 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1532 | AESECB6_UPDATE(2); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1533 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 4 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1534 | AESECB6_UPDATE(3); |
Kazuho Oku | 7da0917 | 2022-04-30 23:54:59 +0900 | [diff] [blame] | 1535 | _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf)); |
| 1536 | _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32))); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1537 | AESECB6_UPDATE(4); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1538 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 3 * 16)), --ghash_precompute); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1539 | AESECB6_UPDATE(5); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1540 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 2 * 16)), --ghash_precompute); |
Kazuho Oku | b854db9 | 2022-04-30 23:44:34 +0900 | [diff] [blame] | 1541 | AESECB6_UPDATE(6); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1542 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 1 * 16)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1543 | AESECB6_UPDATE(7); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1544 | if ((state & STATE_COPY_128B) != 0) { |
Kazuho Oku | 7da0917 | 2022-04-30 23:54:59 +0900 | [diff] [blame] | 1545 | _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64))); |
| 1546 | _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96))); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1547 | output += 128; |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1548 | encp -= 128; |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1549 | AESECB6_UPDATE(8); |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1550 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 128))); |
| 1551 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 160))); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1552 | } else { |
| 1553 | output += 64; |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1554 | encp -= 64; |
Kazuho Oku | 7a0685d | 2022-05-04 15:39:57 +0900 | [diff] [blame] | 1555 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 64))); |
| 1556 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 96))); |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1557 | AESECB6_UPDATE(8); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1558 | } |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1559 | state ^= STATE_COPY_128B; |
Kazuho Oku | 791036a | 2022-05-02 11:51:37 +0900 | [diff] [blame] | 1560 | AESECB6_UPDATE(9); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1561 | if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) { |
| 1562 | for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i) |
Kazuho Oku | 59983e9 | 2022-05-02 12:04:47 +0900 | [diff] [blame] | 1563 | AESECB6_UPDATE(i); |
| 1564 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1565 | assert(ctx->ghash == ghash_precompute); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1566 | gfmul_reduce128(&gstate); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1567 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1568 | } |
| 1569 | } |
| 1570 | |
| 1571 | /* Now, All the encrypted bits are built in encbuf. Calculate AEAD tag and append to encbuf. */ |
| 1572 | |
| 1573 | { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing 7 |
| 1574 | * blocks at once. */ |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1575 | size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16; |
Kazuho Oku | 7fb163f | 2022-05-01 14:09:56 +0900 | [diff] [blame] | 1576 | _mm_storeu_si128((void *)(encbuf + ac_off), ac); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1577 | size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */ |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1578 | assert(blocks <= 7); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1579 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + blocks; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1580 | gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1581 | remaining_ghash_from += 16; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1582 | while (ghash_precompute != ctx->ghash) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1583 | gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1584 | remaining_ghash_from += 16; |
| 1585 | } |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1586 | gfmul_reduce128(&gstate); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1587 | } |
| 1588 | |
| 1589 | /* Calculate EK0, if in the unlikely case on not been done yet. When encoding in full size (16K), EK0 will be ready. */ |
| 1590 | if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1591 | bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]); |
| 1592 | for (size_t i = 1; i < ctx->super.ecb.rounds; ++i) |
| 1593 | bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]); |
| 1594 | bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1595 | } |
| 1596 | |
| 1597 | /* append tag to encbuf */ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1598 | _mm_storeu_si128((void *)encp, gfmul_get_tag128(&gstate, bits5)); |
Kazuho Oku | e0caecc | 2022-05-04 10:09:24 +0900 | [diff] [blame] | 1599 | encp += 16; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1600 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1601 | /* write remaining bytes */ |
| 1602 | write_remaining_bytes(output, encbuf, encp); |
| 1603 | |
| 1604 | #undef AESECB6_INIT |
| 1605 | #undef AESECB6_UPDATE |
| 1606 | #undef AESECB6_FINAL |
| 1607 | #undef STATE_EK0_READY |
| 1608 | #undef STATE_COPY_128B |
| 1609 | } |
| 1610 | /* Decrypts and authenticates one AES-GCM message using 128-bit intrinsics, six 16-byte blocks at a time. |
| | * |
| | * `_input` holds `inlen` bytes, the last 16 of which are the tag. `inlen - 16` bytes of plaintext are written to |
| | * `_output`. `seq` is handed to `calc_counter` to obtain the counter block, and `aad` / `aadlen` is the additional |
| | * authenticated data, fed to GHASH through `reduce_aad128`. |
| | * |
| | * Returns the plaintext length, or SIZE_MAX when the input is shorter than a tag or when the calculated tag differs from |
| | * the one found at the end of the input. Plaintext has already been stored to `_output` by the time the tag is compared. */ |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1611 | static size_t non_temporal_decrypt128(ptls_aead_context_t *_ctx, void *_output, const void *_input, size_t inlen, uint64_t seq, |
| 1612 | const void *aad, size_t aadlen) |
| 1613 | { |
| 1614 | /* Bail out if the input is too short to contain the 16-byte tag; otherwise exclude the tag so that `inlen` counts |
| | * ciphertext bytes only. */ |
| 1615 | if (inlen < 16) |
| 1616 | return SIZE_MAX; |
| 1617 | inlen -= 16; |
| 1618 | size_t textlen = inlen; |
| 1619 | |
| 1620 | /* Starts six blocks: bits0..bits4 are loaded with successive counter values, while the last slot (bits5) takes the next |
| | * counter only if more than 5 blocks of input remain; otherwise it takes `ek0`, so that Ek(0) gets calculated alongside |
| | * the keystream. All six blocks are then XORed with round key 0. */ |
| 1621 | #define AESECB6_INIT() \ |
| 1622 | do { \ |
| 1623 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1624 | bits0 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1625 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1626 | bits1 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1627 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1628 | bits2 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1629 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1630 | bits3 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1631 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1632 | bits4 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1633 | if (PTLS_LIKELY(inlen > 16 * 5)) { \ |
| 1634 | ctr = _mm_add_epi64(ctr, one8); \ |
| 1635 | bits5 = _mm_shuffle_epi8(ctr, byteswap128); \ |
| 1636 | } else { \ |
| 1637 | bits5 = ek0; \ |
| 1638 | state |= STATE_EK0_READY; \ |
| 1639 | } \ |
| 1640 | __m128i k = ctx->super.ecb.keys.m128[0]; \ |
| 1641 | bits0 = _mm_xor_si128(bits0, k); \ |
| 1642 | bits1 = _mm_xor_si128(bits1, k); \ |
| 1643 | bits2 = _mm_xor_si128(bits2, k); \ |
| 1644 | bits3 = _mm_xor_si128(bits3, k); \ |
| 1645 | bits4 = _mm_xor_si128(bits4, k); \ |
| 1646 | bits5 = _mm_xor_si128(bits5, k); \ |
| 1647 | } while (0) |
| 1648 | |
| 1649 | /* AES block update: applies one middle round (aesenc with round key `i`) to all six blocks */ |
| 1650 | #define AESECB6_UPDATE(i) \ |
| 1651 | do { \ |
| 1652 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
| 1653 | bits0 = _mm_aesenc_si128(bits0, k); \ |
| 1654 | bits1 = _mm_aesenc_si128(bits1, k); \ |
| 1655 | bits2 = _mm_aesenc_si128(bits2, k); \ |
| 1656 | bits3 = _mm_aesenc_si128(bits3, k); \ |
| 1657 | bits4 = _mm_aesenc_si128(bits4, k); \ |
| 1658 | bits5 = _mm_aesenc_si128(bits5, k); \ |
| 1659 | } while (0) |
| 1660 | |
| 1661 | /* Final round: applies aesenclast with round key `i` to all six blocks */ |
| 1662 | #define AESECB6_FINAL(i) \ |
| 1663 | do { \ |
| 1664 | __m128i k = ctx->super.ecb.keys.m128[i]; \ |
| 1665 | bits0 = _mm_aesenclast_si128(bits0, k); \ |
| 1666 | bits1 = _mm_aesenclast_si128(bits1, k); \ |
| 1667 | bits2 = _mm_aesenclast_si128(bits2, k); \ |
| 1668 | bits3 = _mm_aesenclast_si128(bits3, k); \ |
| 1669 | bits4 = _mm_aesenclast_si128(bits4, k); \ |
| 1670 | bits5 = _mm_aesenclast_si128(bits5, k); \ |
| 1671 | } while (0) |
| 1672 | |
| 1673 | struct aesgcm_context *agctx = (void *)_ctx; |
| 1674 | uint8_t *output = _output; |
| 1675 | const uint8_t *input = _input; |
| 1676 | /* Flag kept in `state`: set by AESECB6_INIT once `ek0` has been placed in bits5 instead of a counter value. */ |
| 1677 | #define STATE_EK0_READY 0x1 |
| 1678 | int32_t state = 0; |
| 1679 | |
| 1680 | /* Set up ctr, and retain the input of Ek(0) (`ek0`) as well as len(A) | len(C) (`ac`) to be fed into GCM. |
| | * NOTE(review): both lengths are narrowed to `int` before being multiplied by 8, so this looks limited to sizes whose |
| | * bit count fits in 32 bits - confirm that callers bound them. */ |
| 1681 | __m128i ctr = calc_counter(agctx, seq); |
| 1682 | ctr = _mm_insert_epi32(ctr, 1, 0); |
| 1683 | __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128); |
| 1684 | __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128); |
| 1685 | |
| 1686 | struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm; |
| 1687 | __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128(); |
| 1688 | struct ptls_fusion_gfmul_state128 gstate = {0}; |
| 1689 | |
| 1690 | /* Prepare the first 6 blocks of keystream; the ghash of AAD is calculated between AES rounds 2 and 3. */ |
| 1691 | AESECB6_INIT(); |
| 1692 | AESECB6_UPDATE(1); |
| 1693 | AESECB6_UPDATE(2); |
| 1694 | reduce_aad128(&gstate, ctx->ghash, aad, aadlen); |
| 1695 | for (size_t i = 3; i < ctx->super.ecb.rounds; ++i) |
| 1696 | AESECB6_UPDATE(i); |
| 1697 | AESECB6_FINAL(ctx->super.ecb.rounds); |
| 1698 | |
| 1699 | /* Main loop. Operate in full blocks (6 * 16 bytes). Each iteration XORs the current keystream with 96 bytes of input and |
| | * stores the result, then generates the next keystream while feeding the same 96 bytes of input to ghash. |
| | * NOTE(review): the plaintext is stored before the input is loaded again for ghash, which looks like it requires that |
| | * `output` does not alias `input` - confirm against callers. */ |
| 1700 | while (PTLS_LIKELY(inlen >= 6 * 16)) { |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1701 | #define DECRYPT(i) _mm_storeu_si128((void *)(output + i * 16), _mm_xor_si128(bits##i, _mm_loadu_si128((void *)(input + i * 16)))) |
| 1702 | DECRYPT(0); |
| 1703 | DECRYPT(1); |
| 1704 | DECRYPT(2); |
| 1705 | DECRYPT(3); |
| 1706 | DECRYPT(4); |
| 1707 | DECRYPT(5); |
| 1708 | #undef DECRYPT |
| 1709 | #define GFMUL_NEXT(i) gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(input + i * 16)), ctx->ghash + 5 - i) |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1710 | AESECB6_INIT(); |
| 1711 | AESECB6_UPDATE(1); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1712 | AESECB6_UPDATE(2); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1713 | AESECB6_UPDATE(3); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1714 | gfmul_firststep128(&gstate, _mm_loadu_si128((void *)input), ctx->ghash + 5); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1715 | AESECB6_UPDATE(4); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1716 | GFMUL_NEXT(1); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1717 | AESECB6_UPDATE(5); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1718 | GFMUL_NEXT(2); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1719 | AESECB6_UPDATE(6); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1720 | GFMUL_NEXT(3); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1721 | AESECB6_UPDATE(7); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1722 | GFMUL_NEXT(4); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1723 | AESECB6_UPDATE(8); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1724 | GFMUL_NEXT(5); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1725 | AESECB6_UPDATE(9); |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1726 | gfmul_reduce128(&gstate); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1727 | if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) { |
| 1728 | size_t i = 10; |
| 1729 | do { |
| 1730 | AESECB6_UPDATE(i); |
| 1731 | } while (++i < ctx->super.ecb.rounds); |
| 1732 | } |
| 1733 | AESECB6_FINAL(ctx->super.ecb.rounds); |
| 1734 | output += 6 * 16; |
| 1735 | input += 6 * 16; |
| 1736 | inlen -= 6 * 16; |
Kazuho Oku | 908f00a | 2022-05-11 08:52:16 +0900 | [diff] [blame] | 1737 | #undef GFMUL_NEXT |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 1738 | } |
| 1739 | |
| 1740 | /* Decrypt the remainder as well as finishing GHASH calculation. `ghash_precompute` starts one past the entry for the |
| | * remaining blocks plus `ac`, and is pre-decremented once per block fed to ghash, `ac` being the last one. Each block is |
| | * loaded and fed to ghash before being XORed with the keystream; a trailing partial block goes through `loadn128` and |
| | * `storen128`. When nothing remains, `ac` is the only block to be hashed. */ |
| 1741 | if (inlen != 0) { |
| 1742 | struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (inlen + 15) / 16 + 1; |
| 1743 | #define ONEBLOCK(i) \ |
| 1744 | do { \ |
| 1745 | if (inlen != 0) { \ |
| 1746 | __m128i b = inlen >= 16 ? _mm_loadu_si128((void *)input) : loadn128(input, inlen); \ |
| 1747 | if (i == 0) { \ |
| 1748 | gfmul_firststep128(&gstate, b, --ghash_precompute); \ |
| 1749 | } else { \ |
| 1750 | gfmul_nextstep128(&gstate, b, --ghash_precompute); \ |
| 1751 | } \ |
| 1752 | b = _mm_xor_si128(b, bits##i); \ |
| 1753 | if (inlen >= 16) { \ |
| 1754 | _mm_storeu_si128((void *)output, b); \ |
| 1755 | output += 16; \ |
| 1756 | input += 16; \ |
| 1757 | inlen -= 16; \ |
| 1758 | } else { \ |
| 1759 | storen128(output, inlen, b); \ |
| 1760 | output += inlen; \ |
| 1761 | input += inlen; \ |
| 1762 | inlen = 0; \ |
| 1763 | } \ |
| 1764 | } \ |
| 1765 | } while (0) |
| 1766 | ONEBLOCK(0); |
| 1767 | ONEBLOCK(1); |
| 1768 | ONEBLOCK(2); |
| 1769 | ONEBLOCK(3); |
| 1770 | ONEBLOCK(4); |
| 1771 | ONEBLOCK(5); |
| 1772 | #undef ONEBLOCK |
| 1773 | gfmul_nextstep128(&gstate, ac, --ghash_precompute); |
| 1774 | assert(ghash_precompute == ctx->ghash); |
| 1775 | } else { |
| 1776 | gfmul_firststep128(&gstate, ac, ctx->ghash); |
| 1777 | } |
| 1778 | gfmul_reduce128(&gstate); |
| 1779 | |
| 1780 | /* Calculate EK0 if not yet available in bits5 (i.e., AESECB6_INIT never got to put `ek0` into the last slot). */ |
| 1781 | if ((state & STATE_EK0_READY) == 0) { |
| 1782 | bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]); |
| 1783 | for (size_t i = 1; i < ctx->super.ecb.rounds; ++i) |
| 1784 | bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]); |
| 1785 | bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]); |
| 1786 | } |
| 1787 | |
| 1788 | /* Calculate GCM tag and compare against the 16 bytes that follow the ciphertext; the tags match only when all 16 byte |
| | * lanes compare equal (i.e., the mask is 0xffff). */ |
| 1789 | __m128i calctag = gfmul_get_tag128(&gstate, bits5); |
| 1790 | __m128i recvtag = _mm_loadu_si128((void *)input); |
| 1791 | if (_mm_movemask_epi8(_mm_cmpeq_epi8(calctag, recvtag)) != 0xffff) |
| 1792 | return SIZE_MAX; |
| 1793 | |
| 1794 | return textlen; |
| 1795 | |
| 1796 | #undef AESECB6_INIT |
| 1797 | #undef AESECB6_UPDATE |
| 1798 | #undef AESECB6_FINAL |
| 1799 | #undef STATE_EK0_READY |
| 1800 | } |
| 1801 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1802 | NO_SANITIZE_ADDRESS |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 1803 | static void non_temporal_encrypt_v256(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt, |
| 1804 | uint64_t seq, const void *_aad, size_t aadlen) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1805 | { |
| 1806 | /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */ |
| 1807 | #define AESECB6_INIT() \ |
| 1808 | do { \ |
| 1809 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1810 | bits0 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1811 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1812 | bits1 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1813 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1814 | bits2 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1815 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1816 | bits3 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1817 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1818 | bits4 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1819 | ctr = _mm256_add_epi64(ctr, incr128x2); \ |
| 1820 | bits5 = _mm256_shuffle_epi8(ctr, byteswap256); \ |
| 1821 | if (PTLS_UNLIKELY(srclen <= 32 * 6 - 16) && src_vecleft == 0) { \ |
Kazuho Oku | fd7d5c1 | 2022-05-09 21:37:38 +0900 | [diff] [blame] | 1822 | bits5 = _mm256_permute2f128_si256(bits5, ac_ek0, 0x30); \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1823 | state |= STATE_EK0_READY; \ |
| 1824 | } \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1825 | __m256i k = ctx->super.ecb.keys.m256[0]; \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1826 | bits0 = _mm256_xor_si256(bits0, k); \ |
| 1827 | bits1 = _mm256_xor_si256(bits1, k); \ |
| 1828 | bits2 = _mm256_xor_si256(bits2, k); \ |
| 1829 | bits3 = _mm256_xor_si256(bits3, k); \ |
| 1830 | bits4 = _mm256_xor_si256(bits4, k); \ |
| 1831 | bits5 = _mm256_xor_si256(bits5, k); \ |
| 1832 | } while (0) |
| 1833 | |
| 1834 | /* aes block update */ |
| 1835 | #define AESECB6_UPDATE(i) \ |
| 1836 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1837 | __m256i k = ctx->super.ecb.keys.m256[i]; \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1838 | bits0 = _mm256_aesenc_epi128(bits0, k); \ |
| 1839 | bits1 = _mm256_aesenc_epi128(bits1, k); \ |
| 1840 | bits2 = _mm256_aesenc_epi128(bits2, k); \ |
| 1841 | bits3 = _mm256_aesenc_epi128(bits3, k); \ |
| 1842 | bits4 = _mm256_aesenc_epi128(bits4, k); \ |
| 1843 | bits5 = _mm256_aesenc_epi128(bits5, k); \ |
| 1844 | } while (0) |
| 1845 | |
| 1846 | /* aesenclast */ |
| 1847 | #define AESECB6_FINAL(i) \ |
| 1848 | do { \ |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1849 | __m256i k = ctx->super.ecb.keys.m256[i]; \ |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1850 | bits0 = _mm256_aesenclast_epi128(bits0, k); \ |
| 1851 | bits1 = _mm256_aesenclast_epi128(bits1, k); \ |
| 1852 | bits2 = _mm256_aesenclast_epi128(bits2, k); \ |
| 1853 | bits3 = _mm256_aesenclast_epi128(bits3, k); \ |
| 1854 | bits4 = _mm256_aesenclast_epi128(bits4, k); \ |
| 1855 | bits5 = _mm256_aesenclast_epi128(bits5, k); \ |
| 1856 | } while (0) |
| 1857 | |
| 1858 | struct aesgcm_context *agctx = (void *)_ctx; |
| 1859 | uint8_t *output = _output; |
| 1860 | const uint8_t *aad = _aad; |
| 1861 | |
| 1862 | #define STATE_EK0_READY 0x1 |
| 1863 | int32_t state = 0; |
| 1864 | |
| 1865 | /* Bytes are written here first then written using NT store instructions, 64 bytes at a time. */ |
| 1866 | uint8_t encbuf[32 * 9] __attribute__((aligned(32))), *encp; |
| 1867 | |
| 1868 | /* `encbuf` should be large enough to store up to 63-bytes of unaligned bytes, 6 16-byte AES blocks, plus AEAD tag that is |
| 1869 | * append to the ciphertext before writing the bytes to main memory using NT store instructions. */ |
| 1870 | PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16); |
| 1871 | |
Kazuho Oku | 13ced82 | 2022-05-10 10:09:54 +0900 | [diff] [blame] | 1872 | /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */ |
| 1873 | encp = load_preceding_unaligned(encbuf, &output); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1874 | |
| 1875 | /* setup ctr, retaining Ek(0), len(A) | len(C) to be fed into GCM */ |
| 1876 | __m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq)); |
| 1877 | ctr = _mm256_insert_epi32(ctr, 1, 4); |
Kazuho Oku | fd7d5c1 | 2022-05-09 21:37:38 +0900 | [diff] [blame] | 1878 | __m256i ac_ek0 = _mm256_permute2f128_si256( |
| 1879 | /* first half: ac */ |
| 1880 | _mm256_castsi128_si256( |
| 1881 | _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128)), |
| 1882 | /* second half: ek0 */ |
| 1883 | _mm256_shuffle_epi8(ctr, byteswap256), 0x30); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1884 | |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1885 | struct ptls_fusion_aesgcm_context256 *ctx = (void *)agctx->aesgcm; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1886 | __m256i bits0, bits1, bits2, bits3, bits4, bits5 = _mm256_setzero_si256(); |
| 1887 | struct ptls_fusion_gfmul_state256 gstate = {0}; |
| 1888 | |
| 1889 | /* find the first non-empty vec */ |
| 1890 | const uint8_t *src = NULL; |
| 1891 | size_t srclen = 0, src_vecleft = incnt; |
| 1892 | while (srclen == 0 && src_vecleft != 0) { |
| 1893 | src = (void *)input[0].base; |
| 1894 | srclen = input[0].len; |
| 1895 | ++input; |
| 1896 | --src_vecleft; |
| 1897 | } |
| 1898 | |
| 1899 | /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */ |
| 1900 | AESECB6_INIT(); |
| 1901 | AESECB6_UPDATE(1); |
| 1902 | AESECB6_UPDATE(2); |
| 1903 | if (PTLS_LIKELY(aadlen != 0)) { |
| 1904 | union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute; |
| 1905 | while (PTLS_UNLIKELY(aadlen >= 6 * 32)) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1906 | ghash_precompute = ctx->ghash + 6; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1907 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute); |
| 1908 | aad += 32; |
| 1909 | aadlen -= 32; |
| 1910 | for (int i = 1; i < 6; ++i) { |
| 1911 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute); |
| 1912 | aad += 32; |
| 1913 | aadlen -= 32; |
| 1914 | } |
| 1915 | gfmul_reduce256(&gstate); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1916 | } |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1917 | if (PTLS_LIKELY(aadlen != 0)) { |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1918 | ghash_precompute = ctx->ghash + (aadlen + 31) / 32; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1919 | if (PTLS_UNLIKELY(aadlen >= 32)) { |
| 1920 | if (aadlen % 32 == 0 || aadlen % 32 > 16) { |
| 1921 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute); |
| 1922 | aad += 32; |
| 1923 | aadlen -= 32; |
| 1924 | } else { |
| 1925 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 1, --ghash_precompute); |
| 1926 | aad += 16; |
| 1927 | aadlen -= 16; |
| 1928 | } |
| 1929 | while (aadlen >= 32) { |
| 1930 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute); |
| 1931 | aad += 32; |
| 1932 | aadlen -= 32; |
| 1933 | } |
| 1934 | if (PTLS_LIKELY(aadlen != 0)) { |
| 1935 | assert(aadlen > 16); |
| 1936 | gfmul_nextstep256(&gstate, loadn256(aad, aadlen), --ghash_precompute); |
| 1937 | } |
| 1938 | } else { |
| 1939 | gfmul_firststep256(&gstate, loadn256(aad, aadlen), aadlen <= 16, --ghash_precompute); |
| 1940 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1941 | assert(ctx->ghash == ghash_precompute); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1942 | gfmul_reduce256(&gstate); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 1943 | } |
| 1944 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1945 | for (size_t i = 3; i < ctx->super.ecb.rounds; ++i) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1946 | AESECB6_UPDATE(i); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 1947 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 1948 | |
| 1949 | /* Main loop. This loop: |
| 1950 | * 1. using current keystream (bits0..bits5), xors a up to 6 * 16 bytes and writes to encbuf, |
| 1951 | * 2. then if there is no more data to be encrypted, exit the loop, otherwise, |
| 1952 | * 3. calculate ghash of the blocks being written to encbuf, |
| 1953 | * 4. calculate next 6 * 16 bytes of keystream, |
| 1954 | * 5. writes encbuf in 64-byte blocks |
| 1955 | * When exitting the loop, `remaining_ghash_from` represents the offset within `encbuf` from where ghash remains to be |
| 1956 | * calculated. */ |
| 1957 | size_t remaining_ghash_from = encp - encbuf; |
| 1958 | if (srclen != 0) { |
| 1959 | while (1) { |
| 1960 | /* apply the bit stream to input, writing to encbuf */ |
| 1961 | if (PTLS_LIKELY(srclen >= 6 * 32)) { |
| 1962 | #define APPLY(i) _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(src + i * 32)), bits##i)) |
| 1963 | APPLY(0); |
| 1964 | APPLY(1); |
| 1965 | APPLY(2); |
| 1966 | APPLY(3); |
| 1967 | APPLY(4); |
| 1968 | APPLY(5); |
| 1969 | #undef APPLY |
| 1970 | encp += 6 * 32; |
| 1971 | src += 6 * 32; |
| 1972 | srclen -= 6 * 32; |
| 1973 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 1974 | if (src_vecleft == 0) { |
| 1975 | remaining_ghash_from = (encp - encbuf) - 6 * 32; |
| 1976 | break; |
| 1977 | } |
| 1978 | src = (void *)input[0].base; |
| 1979 | srclen = input[0].len; |
| 1980 | ++input; |
| 1981 | --src_vecleft; |
| 1982 | } |
| 1983 | } else { |
| 1984 | /* slow path, load at most 6 * 32 bytes to encbuf then encrypt in-place */ |
| 1985 | size_t bytes_copied = 0; |
| 1986 | do { |
| 1987 | if (srclen >= 32 && bytes_copied < 5 * 32) { |
| 1988 | _mm256_storeu_si256((void *)(encp + bytes_copied), _mm256_loadu_si256((void *)src)); |
| 1989 | bytes_copied += 32; |
| 1990 | src += 32; |
| 1991 | srclen -= 32; |
| 1992 | } else { |
| 1993 | encp[bytes_copied++] = *src++; |
| 1994 | --srclen; |
| 1995 | } |
| 1996 | if (PTLS_UNLIKELY(srclen == 0)) { |
Kazuho Oku | 629b800 | 2022-06-29 12:31:35 +0900 | [diff] [blame] | 1997 | do { |
| 1998 | if (src_vecleft == 0) |
| 1999 | break; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2000 | src = (void *)input[0].base; |
| 2001 | srclen = input[0].len; |
| 2002 | ++input; |
| 2003 | --src_vecleft; |
Kazuho Oku | 629b800 | 2022-06-29 12:31:35 +0900 | [diff] [blame] | 2004 | } while (srclen == 0); |
| 2005 | if (srclen == 0) |
| 2006 | break; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2007 | } |
| 2008 | } while (bytes_copied < 6 * 32); |
| 2009 | #define APPLY(i) \ |
| 2010 | _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(encp + i * 32)), bits##i)) |
| 2011 | APPLY(0); |
| 2012 | APPLY(1); |
| 2013 | APPLY(2); |
| 2014 | APPLY(3); |
| 2015 | APPLY(4); |
| 2016 | APPLY(5); |
| 2017 | #undef APPLY |
| 2018 | encp += bytes_copied; |
| 2019 | if (PTLS_UNLIKELY(srclen == 0)) { |
| 2020 | /* Calculate amonut of data left to be ghashed, as well as zero-clearing the remainedr of partial block, as it |
| 2021 | * will be fed into ghash. */ |
| 2022 | remaining_ghash_from = (encp - encbuf) - bytes_copied; |
| 2023 | if ((bytes_copied & 15) != 0) |
| 2024 | _mm_storeu_si128((void *)encp, _mm_setzero_si128()); |
| 2025 | break; |
| 2026 | } |
| 2027 | } |
| 2028 | |
| 2029 | /* Next 96-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal store |
| 2030 | * instructions. */ |
| 2031 | AESECB6_INIT(); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2032 | union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + 6; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2033 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encp - 6 * 32)), 0, --ghash_precompute); |
| 2034 | AESECB6_UPDATE(1); |
| 2035 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 5 * 32)), --ghash_precompute); |
| 2036 | AESECB6_UPDATE(2); |
| 2037 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 4 * 32)), --ghash_precompute); |
| 2038 | AESECB6_UPDATE(3); |
| 2039 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 3 * 32)), --ghash_precompute); |
| 2040 | AESECB6_UPDATE(4); |
| 2041 | _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf)); |
| 2042 | _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32))); |
| 2043 | _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64))); |
| 2044 | _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96))); |
| 2045 | _mm256_stream_si256((void *)(output + 128), _mm256_load_si256((void *)(encbuf + 128))); |
| 2046 | _mm256_stream_si256((void *)(output + 160), _mm256_load_si256((void *)(encbuf + 160))); |
| 2047 | AESECB6_UPDATE(5); |
| 2048 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 2 * 32)), --ghash_precompute); |
| 2049 | AESECB6_UPDATE(6); |
| 2050 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 1 * 32)), --ghash_precompute); |
| 2051 | output += 192; |
| 2052 | encp -= 192; |
| 2053 | AESECB6_UPDATE(7); |
| 2054 | _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 192))); |
| 2055 | AESECB6_UPDATE(8); |
| 2056 | _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 224))); |
| 2057 | AESECB6_UPDATE(9); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2058 | if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) { |
| 2059 | for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i) |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2060 | AESECB6_UPDATE(i); |
| 2061 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2062 | assert(ctx->ghash == ghash_precompute); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2063 | gfmul_reduce256(&gstate); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2064 | AESECB6_FINAL(ctx->super.ecb.rounds); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2065 | } |
| 2066 | } |
| 2067 | |
| 2068 | /* Now, All the encrypted bits are built in encbuf. Calculate AEAD tag and append to encbuf. */ |
| 2069 | |
| 2070 | { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing 7 |
| 2071 | * blocks at once. */ |
| 2072 | size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16; |
Kazuho Oku | fd7d5c1 | 2022-05-09 21:37:38 +0900 | [diff] [blame] | 2073 | _mm_storeu_si128((void *)(encbuf + ac_off), _mm256_castsi256_si128(ac_ek0)); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2074 | size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */ |
| 2075 | assert(blocks <= 13); |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2076 | union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + blocks / 2; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2077 | if (blocks % 2 != 0) { |
| 2078 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 1, ghash_precompute); |
| 2079 | remaining_ghash_from += 16; |
| 2080 | } else { |
| 2081 | gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 0, --ghash_precompute); |
| 2082 | remaining_ghash_from += 32; |
| 2083 | } |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2084 | while (ghash_precompute != ctx->ghash) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2085 | gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), --ghash_precompute); |
| 2086 | remaining_ghash_from += 32; |
| 2087 | } |
| 2088 | gfmul_reduce256(&gstate); |
| 2089 | } |
| 2090 | |
| 2091 | /* Calculate EK0, if in the unlikely case on not been done yet. When encoding in full size (16K), EK0 will be ready. */ |
| 2092 | if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) { |
Kazuho Oku | fd7d5c1 | 2022-05-09 21:37:38 +0900 | [diff] [blame] | 2093 | bits5 = ac_ek0; |
Kazuho Oku | 07f37c2 | 2022-05-10 09:45:42 +0900 | [diff] [blame] | 2094 | bits5 = _mm256_xor_si256(bits5, ctx->super.ecb.keys.m256[0]); |
| 2095 | for (size_t i = 1; i < ctx->super.ecb.rounds; ++i) |
| 2096 | bits5 = _mm256_aesenc_epi128(bits5, ctx->super.ecb.keys.m256[i]); |
| 2097 | bits5 = _mm256_aesenclast_epi128(bits5, ctx->super.ecb.keys.m256[ctx->super.ecb.rounds]); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2098 | } |
| 2099 | |
| 2100 | /* append tag to encbuf */ |
| 2101 | _mm_storeu_si128((void *)encp, |
| 2102 | gfmul_get_tag256(&gstate, _mm256_castsi256_si128(_mm256_permute2f128_si256(bits5, bits5, 0x11)))); |
| 2103 | encp += 16; |
| 2104 | |
| 2105 | /* write remaining bytes */ |
| 2106 | write_remaining_bytes(output, encbuf, encp); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2107 | } |
| 2108 | |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2109 | static int non_temporal_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2110 | { |
| 2111 | struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx; |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2112 | int aesni256 = is_enc && ptls_fusion_can_aesni256; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2113 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2114 | ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE); |
| 2115 | ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2116 | if (key == NULL) |
| 2117 | return 0; |
| 2118 | |
| 2119 | ctx->super.dispose_crypto = aesgcm_dispose_crypto; |
Kazuho Oku | bb2cac2 | 2023-02-13 16:09:08 +0900 | [diff] [blame] | 2120 | ctx->super.do_get_iv = aesgcm_get_iv; |
| 2121 | ctx->super.do_set_iv = aesgcm_set_iv; |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2122 | ctx->super.do_encrypt_init = NULL; |
| 2123 | ctx->super.do_encrypt_update = NULL; |
| 2124 | ctx->super.do_encrypt_final = NULL; |
| 2125 | if (is_enc) { |
| 2126 | ctx->super.do_encrypt = ptls_aead__do_encrypt; |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2127 | ctx->super.do_encrypt_v = aesni256 ? non_temporal_encrypt_v256 : non_temporal_encrypt_v128; |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2128 | ctx->super.do_decrypt = NULL; |
| 2129 | } else { |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2130 | assert(!aesni256); |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2131 | ctx->super.do_encrypt = NULL; |
| 2132 | ctx->super.do_encrypt_v = NULL; |
| 2133 | ctx->super.do_decrypt = non_temporal_decrypt128; |
| 2134 | } |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2135 | |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2136 | ctx->aesgcm = |
| 2137 | new_aesgcm(key, key_size, |
| 2138 | 7 * (ptls_fusion_can_aesni256 ? 32 : 16), // 6 blocks at once, plus len(A) | len(C) that we might append |
| 2139 | aesni256); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2140 | |
| 2141 | return 0; |
| 2142 | } |
| 2143 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 2144 | static int non_temporal_aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2145 | { |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2146 | return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2147 | } |
| 2148 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 2149 | static int non_temporal_aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv) |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2150 | { |
Kazuho Oku | 2094f78 | 2022-05-10 22:13:00 +0900 | [diff] [blame] | 2151 | return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE); |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2152 | } |
| 2153 | |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 2154 | ptls_aead_algorithm_t ptls_non_temporal_aes128gcm = {"AES128-GCM", |
| 2155 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 2156 | PTLS_AESGCM_INTEGRITY_LIMIT, |
| 2157 | &ptls_fusion_aes128ctr, |
| 2158 | NULL, // &ptls_fusion_aes128ecb, |
| 2159 | PTLS_AES128_KEY_SIZE, |
| 2160 | PTLS_AESGCM_IV_SIZE, |
| 2161 | PTLS_AESGCM_TAG_SIZE, |
Kazuho Oku | 93944ce | 2022-07-06 16:41:08 +0900 | [diff] [blame] | 2162 | {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE}, |
Kazuho Oku | 9dc6982 | 2022-06-08 13:25:52 +0900 | [diff] [blame] | 2163 | 1, |
Kazuho Oku | 1edf707 | 2022-07-11 10:30:43 +0900 | [diff] [blame] | 2164 | PTLS_X86_CACHE_LINE_ALIGN_BITS, |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 2165 | sizeof(struct aesgcm_context), |
| 2166 | non_temporal_aes128gcm_setup}; |
| 2167 | ptls_aead_algorithm_t ptls_non_temporal_aes256gcm = {"AES256-GCM", |
| 2168 | PTLS_AESGCM_CONFIDENTIALITY_LIMIT, |
| 2169 | PTLS_AESGCM_INTEGRITY_LIMIT, |
| 2170 | &ptls_fusion_aes256ctr, |
| 2171 | NULL, // &ptls_fusion_aes128ecb, |
| 2172 | PTLS_AES256_KEY_SIZE, |
| 2173 | PTLS_AESGCM_IV_SIZE, |
| 2174 | PTLS_AESGCM_TAG_SIZE, |
Kazuho Oku | 93944ce | 2022-07-06 16:41:08 +0900 | [diff] [blame] | 2175 | {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE}, |
Kazuho Oku | 9dc6982 | 2022-06-08 13:25:52 +0900 | [diff] [blame] | 2176 | 1, |
Kazuho Oku | 1edf707 | 2022-07-11 10:30:43 +0900 | [diff] [blame] | 2177 | PTLS_X86_CACHE_LINE_ALIGN_BITS, |
Kazuho Oku | 34e9b2d | 2022-05-09 17:04:02 +0900 | [diff] [blame] | 2178 | sizeof(struct aesgcm_context), |
| 2179 | non_temporal_aes256gcm_setup}; |
Kazuho Oku | ed661b1 | 2022-04-30 21:43:31 +0900 | [diff] [blame] | 2180 | |
Christian Huitema | 3c3e3f2 | 2020-06-23 15:49:15 -0700 | [diff] [blame] | 2181 | #ifdef _WINDOWS |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2182 | /** |
| 2183 | * ptls_fusion_is_supported_by_cpu: |
| 2184 | * Check that the CPU has extended instructions for PCMUL, AES and AVX2. |
| 2185 | * This test assumes that the CPU is following the x86/x64 architecture. |
| 2186 | * A slightly more refined test could check that the cpu_info spells out |
| 2187 | * "genuineIntel" or "authenticAMD", but would fail in presence of |
| 2188 | * little known CPU brands or some VM */ |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 2189 | int ptls_fusion_is_supported_by_cpu(void) |
| 2190 | { |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2191 | uint32_t cpu_info[4]; |
| 2192 | uint32_t nb_ids; |
| 2193 | int is_supported = 0; |
| 2194 | |
| 2195 | __cpuid(cpu_info, 0); |
| 2196 | nb_ids = cpu_info[0]; |
| 2197 | |
| 2198 | if (nb_ids >= 7) { |
| 2199 | uint32_t leaf1_ecx; |
| 2200 | __cpuid(cpu_info, 1); |
| 2201 | leaf1_ecx = cpu_info[2]; |
Kazuho Oku | 14c00c0 | 2020-09-12 20:48:25 +0900 | [diff] [blame] | 2202 | |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2203 | if (/* PCLMUL */ (leaf1_ecx & (1 << 5)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2204 | uint32_t leaf7_ebx, leaf7_ecx; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2205 | __cpuid(cpu_info, 7); |
| 2206 | leaf7_ebx = cpu_info[1]; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2207 | leaf7_ecx = cpu_info[2]; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2208 | |
| 2209 | is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2210 | |
| 2211 | /* enable 256-bit mode if possible */ |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2212 | if (is_supported && (leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256) |
| 2213 | ptls_fusion_can_aesni256 = 1; |
Christian Huitema | 8160543 | 2020-06-23 15:38:36 -0700 | [diff] [blame] | 2214 | } |
| 2215 | } |
| 2216 | |
| 2217 | return is_supported; |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 2218 | } |
| 2219 | #else |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 2220 | int ptls_fusion_is_supported_by_cpu(void) |
| 2221 | { |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2222 | unsigned leaf1_ecx, leaf7_ebx, leaf7_ecx; |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 2223 | |
| 2224 | { /* GCC-specific code to obtain CPU features */ |
Kazuho Oku | efce043 | 2020-05-15 04:51:58 +0900 | [diff] [blame] | 2225 | unsigned leaf_cnt; |
| 2226 | __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx"); |
| 2227 | if (leaf_cnt < 7) |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 2228 | return 0; |
Kazuho Oku | efce043 | 2020-05-15 04:51:58 +0900 | [diff] [blame] | 2229 | __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx"); |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2230 | __asm__("cpuid" : "=b"(leaf7_ebx), "=c"(leaf7_ecx) : "a"(7), "c"(0) : "edx"); |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 2231 | } |
| 2232 | |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 2233 | /* AVX2 */ |
| 2234 | if ((leaf7_ebx & (1 << 5)) == 0) |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 2235 | return 0; |
Kazuho Oku | 3604f8b | 2020-05-15 04:24:27 +0900 | [diff] [blame] | 2236 | /* AES */ |
| 2237 | if ((leaf1_ecx & (1 << 25)) == 0) |
| 2238 | return 0; |
| 2239 | /* PCLMUL */ |
| 2240 | if ((leaf1_ecx & (1 << 1)) == 0) |
| 2241 | return 0; |
| 2242 | |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2243 | /* enable 256-bit mode if possible */ |
Kazuho Oku | 65d3e79 | 2022-06-29 12:04:02 +0900 | [diff] [blame] | 2244 | if ((leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256) |
| 2245 | ptls_fusion_can_aesni256 = 1; |
Kazuho Oku | 680ce18 | 2022-05-09 15:53:18 +0900 | [diff] [blame] | 2246 | |
Kazuho Oku | 3ee790b | 2020-05-15 03:35:03 +0900 | [diff] [blame] | 2247 | return 1; |
| 2248 | } |
Christian Huitema | c17ef18 | 2020-06-22 20:41:45 -0700 | [diff] [blame] | 2249 | #endif |