extract "unsafe" logic (code that needs NO_SANITIZE_ADDRESS) into helper functions
diff --git a/lib/fusion.c b/lib/fusion.c
index 0d04772..527becb 100644
--- a/lib/fusion.c
+++ b/lib/fusion.c
@@ -1265,6 +1265,21 @@
}
}
+NO_SANITIZE_ADDRESS /* presumably required because the loads below begin before `*output` -- confirm against the macro */
+static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output)
+{ /* Copies the 64-byte block containing `*output` into `encbuf` and rewinds `*output` to that block's start. */
+ uint8_t *encp; /* return value: `encbuf` plus the offset of the original `*output` within its 64-byte block */
+
+ if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) { /* skipped when `*output` is already 64-byte aligned */
+ _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf))));
+ _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32)));
+ *output -= encp - encbuf; /* the caller's pointer now sits on the 64-byte boundary */
+ } /* NOTE(review): the aligned store intrinsics appear to need `encbuf` 32-byte aligned -- confirm at the call sites */
+
+ return encp;
+}
+
+NO_SANITIZE_ADDRESS
static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end)
{
/* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line
@@ -1283,7 +1298,6 @@
}
}
-NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
uint64_t seq, const void *aad, size_t aadlen)
{
@@ -1361,12 +1375,9 @@
* append to the ciphertext before writing the bytes to main memory using NT store instructions. */
PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16);
- /* determine `num_bytes_write_delayed` as well as initializing `encbuf`, adjusting `output` */
- if ((encp = encbuf + ((uintptr_t)output & 63)) != encbuf) {
- _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(output - (encp - encbuf))));
- _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(output - (encp - encbuf) + 32)));
- output -= encp - encbuf;
- }
+ /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */
+ encp = load_preceding_unaligned(encbuf, &output);
+
/* First write would be 128 bytes (32+6*16), if encbuf contains no less than 32 bytes already. */
if (encp - encbuf >= 32)
state |= STATE_COPY_128B;
@@ -1633,12 +1644,8 @@
* append to the ciphertext before writing the bytes to main memory using NT store instructions. */
PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16);
- /* determine `num_bytes_write_delayed` as well as initializing `encbuf`, adjusting `output` */
- if ((encp = encbuf + ((uintptr_t)output & 63)) != encbuf) {
- _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(output - (encp - encbuf))));
- _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(output - (encp - encbuf) + 32)));
- output -= encp - encbuf;
- }
+ /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */
+ encp = load_preceding_unaligned(encbuf, &output);
/* setup ctr, retaining Ek(0), len(A) | len(C) to be fed into GCM */
__m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq));