Unroll hot loops in lib/fusion.c (SETUP_BITS counter setup, main-loop XOR)
diff --git a/lib/fusion.c b/lib/fusion.c
index 5f5b527..39a64c8 100644
--- a/lib/fusion.c
+++ b/lib/fusion.c
@@ -353,11 +353,17 @@
 
 /* setup the counters (we can always run in full), but use the last slot for calculating ek0, if possible */
 #define SETUP_BITS()                                                                                                               \
-    do {                                                                                                                           \
-        for (int i = 0; i < 5; ++i) {                                                                                              \
-            ctr = _mm_add_epi64(ctr, one64);                                                                                       \
-            bits[i] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
-        }                                                                                                                          \
+    do { \
+        ctr = _mm_add_epi64(ctr, one64);                                                                                       \
+        bits[0] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
+        ctr = _mm_add_epi64(ctr, one64);                                                                                       \
+        bits[1] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
+        ctr = _mm_add_epi64(ctr, one64);                                                                                       \
+        bits[2] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
+        ctr = _mm_add_epi64(ctr, one64);                                                                                       \
+        bits[3] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
+        ctr = _mm_add_epi64(ctr, one64);                                                                                       \
+        bits[4] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
         if (PTLS_LIKELY(srclen > 16 * 5)) {                                                                                        \
             ctr = _mm_add_epi64(ctr, one64);                                                                                       \
             bits[5] = _mm_shuffle_epi8(ctr, bswap64);                                                                              \
@@ -375,8 +381,16 @@
     /* the main loop */
     while (PTLS_LIKELY(srclen >= 6 * 16)) {
         /* apply the bits */
-        for (int i = 0; i < 6; ++i)
-            _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits[i]));
+#define APPLY(i) _mm_storeu_si128(dst + (i), _mm_xor_si128(_mm_loadu_si128(src + (i)), bits[i])) /* dst[i] = src[i] ^ bits[i] */
+        APPLY(0);
+        APPLY(1);
+        APPLY(2);
+        APPLY(3);
+        APPLY(4);
+        APPLY(5);
+#undef APPLY
+        dst += 6;
+        src += 6;
         srclen -= 6 * 16;
 
         /* setup bits */