Add fast multiply asm for AVR (#50)
diff --git a/asm_arm.inc b/asm_arm.inc
index cd235b8..5f07264 100644
--- a/asm_arm.inc
+++ b/asm_arm.inc
@@ -3,8 +3,6 @@
 #ifndef _UECC_ASM_ARM_H_
 #define _UECC_ASM_ARM_H_
 
-#include "asm_arm_mult_square.inc"
-
 #if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
     #define uECC_MIN_WORDS 8
 #endif
@@ -158,6 +156,8 @@
 
 #if (uECC_OPTIMIZATION_LEVEL >= 3)
 
+#include "asm_arm_mult_square.inc"
+
 #define FAST_MULT_ASM_5_TO_6                 \
     "cmp r3, #5 \n\t"                        \
     "beq 1f \n\t"                            \
diff --git a/asm_avr.inc b/asm_avr.inc
index c99bf82..0d4582f 100644
--- a/asm_avr.inc
+++ b/asm_avr.inc
@@ -3,6 +3,22 @@
 #ifndef _UECC_ASM_AVR_H_
 #define _UECC_ASM_AVR_H_
 
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1) /* uECC_MIN_WORDS = word count of the smallest enabled curve; 256-bit -> 32 words here */
+    #define uECC_MIN_WORDS 32
+#endif
+#if uECC_SUPPORTS_secp224r1 /* each later check overrides the previous value with a smaller size */
+    #undef uECC_MIN_WORDS
+    #define uECC_MIN_WORDS 28
+#endif
+#if uECC_SUPPORTS_secp192r1 /* 192-bit curve -> 24 words */
+    #undef uECC_MIN_WORDS
+    #define uECC_MIN_WORDS 24
+#endif
+#if uECC_SUPPORTS_secp160r1 /* checked last, so 20 wins whenever secp160r1 is enabled */
+    #undef uECC_MIN_WORDS
+    #define uECC_MIN_WORDS 20
+#endif
+
 #if __AVR_HAVE_EIJMP_EICALL__
     #define IJMP "eijmp \n\t"
 #else
@@ -189,6 +205,64 @@
 }
 #define asm_sub 1
 
+#if (uECC_OPTIMIZATION_LEVEL >= 3)
+
+#include "asm_avr_mult_square.inc"
+
+__attribute((noinline)) /* kept out of line; the asm body below defines the plain label "done" */
+uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
+                                const uECC_word_t *left,
+                                const uECC_word_t *right,
+                                wordcount_t num_words) {
+    /* Multiplies left by right into result with the unrolled FAST_MULT_ASM_* blocks. num_words should already be in r18 (presumably as the 4th argument under the avr-gcc ABI -- TODO confirm). */
+    register wordcount_t r18 __asm__("r18") = num_words;
+    /* Shape: one fixed-size multiply for uECC_MIN_WORDS, then optional *_TO_* extension stages up to uECC_MAX_WORDS. */
+    __asm__ volatile (
+        "push r18 \n\t" /* save num_words; it is popped again right after the fixed-size multiply */
+#if (uECC_MIN_WORDS == 20)
+        FAST_MULT_ASM_20
+        "pop r18 \n\t" /* restore num_words; FAST_MULT_ASM_20_TO_24 begins with "cpi r18, 20" */
+    #if (uECC_MAX_WORDS > 20) /* extension stages are only emitted when a larger curve is enabled */
+        FAST_MULT_ASM_20_TO_24
+    #endif
+    #if (uECC_MAX_WORDS > 24)
+        FAST_MULT_ASM_24_TO_28
+    #endif
+    #if (uECC_MAX_WORDS > 28)
+        FAST_MULT_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 24)
+        FAST_MULT_ASM_24
+        "pop r18 \n\t"
+    #if (uECC_MAX_WORDS > 24)
+        FAST_MULT_ASM_24_TO_28
+    #endif
+    #if (uECC_MAX_WORDS > 28)
+        FAST_MULT_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 28)
+        FAST_MULT_ASM_28
+        "pop r18 \n\t"
+    #if (uECC_MAX_WORDS > 28)
+        FAST_MULT_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 32)
+        FAST_MULT_ASM_32
+        "pop r18 \n\t"
+#endif /* NOTE(review): if uECC_MIN_WORDS matches none of 20/24/28/32, the push above has no matching pop -- confirm a curve is always enabled */
+        "done: \n\t" /* early-exit target: FAST_MULT_ASM_20_TO_24 does "jmp done" when r18 == 20 */
+        "eor r1, r1 \n\t" /* mul writes its product to r1:r0; clear r1 again (avr-gcc treats r1 as the zero register -- see avr-gcc ABI) */
+        : "+x" (left), "+y" (right), "+z" (result) /* X = left, Y = right, Z = result; read-write because the asm moves all three pointers */
+        : "r" (r18) /* num_words, pinned to r18 by the register variable above */
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+          "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
+          "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+    );
+}
+#define asm_mult 1 /* marks that an asm uECC_vli_mult exists; presumably gates a C fallback like the "#if !asm_add" guard -- verify */
+
+#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
+
 #if uECC_SUPPORTS_secp160r1
 static const struct uECC_Curve_t curve_secp160r1;
 static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
@@ -704,6 +778,8 @@
 
 #endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
 
+/* ---- "Small" implementations ---- */
+
 #if !asm_add
 uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
                                       const uECC_word_t *left,
diff --git a/asm_avr_mult_square.inc b/asm_avr_mult_square.inc
index 5581bb4..687c618 100644
--- a/asm_avr_mult_square.inc
+++ b/asm_avr_mult_square.inc
@@ -3,7 +3,7 @@
 #ifndef _UECC_ASM_AVR_MULT_SQUARE_H_
 #define _UECC_ASM_AVR_MULT_SQUARE_H_
 
-#define FAST_MULT_ASM_5    \
+#define FAST_MULT_ASM_20   \
     "adiw r30, 10 \n\t"    \
     "adiw r28, 10 \n\t"    \
     "ld r2, x+ \n\t"       \
@@ -1905,10 +1905,911 @@
     "add r23, r0 \n\t"     \
     "adc r24, r1 \n\t"     \
     "st z+, r23 \n\t"      \
-    "st z+, r24 \n\t"      \
-    "eor r1, r1 \n\t"
+    "st z+, r24 \n\t"
 
-#define FAST_MULT_ASM_6        \
+#define FAST_MULT_ASM_20_TO_24     \
+    "cpi r18, 20 \n\t"             \
+    "brne 1f \n\t"                 \
+    "jmp done \n\t"                \
+    "1: \n\t"                      \
+    "ld r2, x+ \n\t"               \
+    "ld r6, y+ \n\t"               \
+    "ld r3, x+ \n\t"               \
+    "ld r7, y+ \n\t"               \
+    "ld r4, x+ \n\t"               \
+    "ld r8, y+ \n\t"               \
+    "ld r5, x+ \n\t"               \
+    "ld r9, y+ \n\t"               \
+    "sbiw r26, 24 \n\t"            \
+    "sbiw r28, 24 \n\t"            \
+    "sbiw r30, 20 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+                                   \
+    "mul r2, r14 \n\t"             \
+    "mov r19, r0 \n\t"             \
+    "mov r20, r1 \n\t"             \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r11, r9 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r12, r8 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r13, r7 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r6 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r12, r9 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r13, r8 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r7 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r6 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r13, r9 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r8 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r7 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r6 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r2, r9 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r8 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r7 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r6 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r3, r9 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r8 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r7 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r4, r9 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r8 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "mul r5, r9 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "st z+, r21 \n\t"              \
+    "st z+, r19 \n\t"              \
+    "adiw r26, 4 \n\t"             \
+    "adiw r28, 4 \n\t"
+
+#define FAST_MULT_ASM_24       \
     "adiw r30, 20 \n\t"        \
     "adiw r28, 20 \n\t"        \
     "ld r2, x+ \n\t"           \
@@ -4677,10 +5578,1071 @@
     "add r22, r0 \n\t"         \
     "adc r23, r1 \n\t"         \
     "st z+, r22 \n\t"          \
-    "st z+, r23 \n\t"          \
-    "eor r1, r1 \n\t"
+    "st z+, r23 \n\t"
 
-#define FAST_MULT_ASM_7    \
+#define FAST_MULT_ASM_24_TO_28     \
+    "cpi r18, 24 \n\t"             \
+    "brne 1f \n\t"                 \
+    "jmp done \n\t"                \
+    "1: \n\t"                      \
+    "ld r2, x+ \n\t"               \
+    "ld r6, y+ \n\t"               \
+    "ld r3, x+ \n\t"               \
+    "ld r7, y+ \n\t"               \
+    "ld r4, x+ \n\t"               \
+    "ld r8, y+ \n\t"               \
+    "ld r5, x+ \n\t"               \
+    "ld r9, y+ \n\t"               \
+    "sbiw r26, 28 \n\t"            \
+    "sbiw r28, 28 \n\t"            \
+    "sbiw r30, 24 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+                                   \
+    "mul r2, r14 \n\t"             \
+    "mov r19, r0 \n\t"             \
+    "mov r20, r1 \n\t"             \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r11, r9 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r12, r8 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r13, r7 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r6 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r12, r9 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r13, r8 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r7 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r6 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r13, r9 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r8 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r7 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r6 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r2, r9 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r8 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r7 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r6 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r3, r9 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r8 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r7 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r4, r9 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r8 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "mul r5, r9 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "st z+, r19 \n\t"              \
+    "st z+, r20 \n\t"              \
+    "adiw r26, 4 \n\t"             \
+    "adiw r28, 4 \n\t"
+
+#define FAST_MULT_ASM_28   \
     "adiw r30, 20 \n\t"    \
     "adiw r28, 20 \n\t"    \
     "ld r2, x+ \n\t"       \
@@ -8437,10 +10399,1230 @@
     "add r24, r0 \n\t"     \
     "adc r22, r1 \n\t"     \
     "st z+, r24 \n\t"      \
-    "st z+, r22 \n\t"      \
-    "eor r1, r1 \n\t"
+    "st z+, r22 \n\t"
 
-#define FAST_MULT_ASM_8        \
+#define FAST_MULT_ASM_28_TO_32     \
+    "cpi r18, 28 \n\t"             \
+    "brne 1f \n\t"                 \
+    "jmp done \n\t"                \
+    "1: \n\t"                      \
+    "ld r2, x+ \n\t"               \
+    "ld r6, y+ \n\t"               \
+    "ld r3, x+ \n\t"               \
+    "ld r7, y+ \n\t"               \
+    "ld r4, x+ \n\t"               \
+    "ld r8, y+ \n\t"               \
+    "ld r5, x+ \n\t"               \
+    "ld r9, y+ \n\t"               \
+    "sbiw r26, 32 \n\t"            \
+    "sbiw r28, 32 \n\t"            \
+    "sbiw r30, 28 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+                                   \
+    "mul r2, r14 \n\t"             \
+    "mov r19, r0 \n\t"             \
+    "mov r20, r1 \n\t"             \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r10, x+ \n\t"              \
+    "ld r14, y+ \n\t"              \
+    "mul r2, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r20, r0 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "ld r11, x+ \n\t"              \
+    "ld r15, y+ \n\t"              \
+    "mul r2, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r6, r11 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r14 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r7, r10 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r8, r13 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r9, r12 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r21, r0 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "ld r12, x+ \n\t"              \
+    "ld r16, y+ \n\t"              \
+    "mul r2, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r6, r12 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r15 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r7, r11 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r14 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r8, r10 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r9, r13 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "ld r0, z \n\t"                \
+    "add r19, r0 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "ld r13, x+ \n\t"              \
+    "ld r17, y+ \n\t"              \
+    "mul r2, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r6, r13 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r16 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r7, r12 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r15 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r8, r11 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r14 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r9, r10 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r11, r9 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r12, r8 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r13, r7 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r2, r6 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r17 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r16 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r15 \n\t"             \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r12, r9 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r13, r8 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r2, r7 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r3, r6 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r17 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r16 \n\t"             \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r13, r9 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r2, r8 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r3, r7 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r4, r6 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r17 \n\t"             \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "ldi r19, 0 \n\t"              \
+    "mul r2, r9 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r3, r8 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r4, r7 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "mul r5, r6 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "adc r19, r25 \n\t"            \
+    "st z+, r20 \n\t"              \
+                                   \
+    "ldi r20, 0 \n\t"              \
+    "mul r3, r9 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r4, r8 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "mul r5, r7 \n\t"              \
+    "add r21, r0 \n\t"             \
+    "adc r19, r1 \n\t"             \
+    "adc r20, r25 \n\t"            \
+    "st z+, r21 \n\t"              \
+                                   \
+    "ldi r21, 0 \n\t"              \
+    "mul r4, r9 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "mul r5, r8 \n\t"              \
+    "add r19, r0 \n\t"             \
+    "adc r20, r1 \n\t"             \
+    "adc r21, r25 \n\t"            \
+    "st z+, r19 \n\t"              \
+                                   \
+    "mul r5, r9 \n\t"              \
+    "add r20, r0 \n\t"             \
+    "adc r21, r1 \n\t"             \
+    "st z+, r20 \n\t"              \
+    "st z+, r21 \n\t"
+    /* Not necessary to move ptrs since we don't support sizes > 32 */
+
+#define FAST_MULT_ASM_32       \
     "adiw r30, 30 \n\t"        \
     "adiw r28, 30 \n\t"        \
     "ld r2, x+ \n\t"           \
@@ -13352,10 +16534,9 @@
     "add r23, r0 \n\t"         \
     "adc r24, r1 \n\t"         \
     "st z+, r23 \n\t"          \
-    "st z+, r24 \n\t"          \
-    "eor r1, r1 \n\t"
+    "st z+, r24 \n\t"
 
-#define FAST_SQUARE_ASM_5          \
+#define FAST_SQUARE_ASM_20         \
     "ld r2, x+ \n\t"               \
     "ld r3, x+ \n\t"               \
     "ld r4, x+ \n\t"               \
@@ -14510,7 +17691,7 @@
     "st z+, r25 \n\t"              \
     "eor r1, r1 \n\t"
 
-#define FAST_SQUARE_ASM_6              \
+#define FAST_SQUARE_ASM_24             \
     "ldi r25, 0 \n\t"                  \
     "movw r28, r26 \n\t"               \
     "ld r2, x+ \n\t"                   \
@@ -16157,7 +19338,7 @@
     "st z+, r28 \n\t"                  \
     "eor r1, r1 \n\t"
 
-#define FAST_SQUARE_ASM_7          \
+#define FAST_SQUARE_ASM_28         \
     "ldi r25, 0 \n\t"              \
     "movw r28, r26 \n\t"           \
     "ld r2, x+ \n\t"               \
@@ -18360,7 +21541,7 @@
     "st z+, r28 \n\t"              \
     "eor r1, r1 \n\t"
 
-#define FAST_SQUARE_ASM_8              \
+#define FAST_SQUARE_ASM_32             \
     "ldi r25, 0 \n\t"                  \
     "movw r28, r26 \n\t"               \
     "ld r2, x+ \n\t"                   \
diff --git a/scripts/mult_avr_extra.py b/scripts/mult_avr_extra.py
new file mode 100755
index 0000000..f6e654f
--- /dev/null
+++ b/scripts/mult_avr_extra.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+import sys
+
+if len(sys.argv) < 2:  # the operand size is the single, mandatory argument
+    print("Provide the integer size in bytes")  # call form parses on Python 2 and 3
+    sys.exit(1)
+
+size = int(sys.argv[1])  # operand size in bytes; sets the rewind offsets and the row count
+
+def lhi(i):  # register number for byte i of the left "high" block (r2..r5 for i = 0..3)
+    return i + 2
+
+def rhi(i):  # register number for byte i of the right "high" block (r6..r9 for i = 0..3)
+    return i + 6
+
+left_lo = [10, 11, 12, 13]  # registers for the 4-byte left "low" window, oldest byte first
+right_lo = [14, 15, 16, 17]  # registers for the 4-byte right "low" window, oldest byte first
+
+def llo(i):  # register at slot i of the left low window (slot order changes in update_low)
+    return left_lo[i]
+
+def rlo(i):  # register at slot i of the right low window (slot order changes in update_low)
+    return right_lo[i]
+
+def emit(line, *args):  # print `line` as a quoted C string fragment ending in a literal \n\t
+    s = '"' + line + r' \n\t"'  # raw string: the backslashes are emitted verbatim
+    print(s % args)  # call form: same output on Python 2, no SyntaxError on Python 3
+
+def update_low():
+    # Slide both low windows by one byte: recycle the oldest register as the newest slot.
+    global left_lo, right_lo
+    left_lo = left_lo[1:] + [left_lo[0]]
+    right_lo = right_lo[1:] + [right_lo[0]]
+    emit("ld r%s, x+", left_lo[-1])
+    emit("ld r%s, y+", right_lo[-1])
+
+accum = [19, 20, 21]  # registers of the 3-byte running column sum, least significant first
+
+def acc(i):  # register currently in accumulator slot i (slot 0 is the byte stored next)
+    return accum[i]
+
+def rotate_acc():  # shift slots down by one; the register just stored becomes the top slot
+    global accum
+    accum = accum[1:] + [accum[0]]
+
+# Load the 4 high bytes of each operand: left via X into r2..r5, right via Y into r6..r9
+for i in range(4):  # range() exists on Python 2 and 3; xrange() is Python 2 only
+    emit("ld r%s, x+", lhi(i))
+    emit("ld r%s, y+", rhi(i))
+
+emit("sbiw r26, %s", size + 4)  # rewind X (r27:r26) past the 4 bytes just read plus `size` more
+emit("sbiw r28, %s", size + 4)  # rewind Y (r29:r28) by the same amount
+emit("sbiw r30, %s", size)  # rewind Z (r31:r30), the pointer used for result stores, by `size`
+
+# Load the first 4 low bytes of each operand: r10..r13 via X, r14..r17 via Y
+for i in range(4):
+    emit("ld r%s, x+", llo(i))
+    emit("ld r%s, y+", rlo(i))
+print("")  # blank separator line; call form works on Python 2 and 3
+
+# Compute initial triangles: rows 0..3, where row i uses high bytes 0..i against low bytes i..0
+emit("mul r%s, r%s", lhi(0), rlo(0))  # AVR mul leaves the 16-bit product in r1:r0
+emit("mov r%s, r0", acc(0))  # row 0 seeds the accumulator with mov instead of adding
+emit("mov r%s, r1", acc(1))
+emit("ldi r%s, 0", acc(2))
+emit("ld r0, z")  # fold in the byte already present at this result position
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r25", acc(1))  # carry only; assumes r25 == 0 (not set by this script - TODO confirm)
+emit("mul r%s, r%s", rhi(0), llo(0))
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r1", acc(1))
+emit("adc r%s, r25", acc(2))
+emit("st z+, r%s", acc(0))
+print("")  # blank separator line; call form works on Python 2 and 3
+rotate_acc()
+
+for i in range(1, 4):  # range() exists on Python 2 and 3; xrange() is Python 2 only
+    emit("ldi r%s, 0", acc(2))  # clear the slot that was recycled by rotate_acc()
+    emit("ld r0, z")
+    emit("add r%s, r0", acc(0))
+    emit("adc r%s, r25", acc(1))
+    for j in range(i + 1):
+        emit("mul r%s, r%s", lhi(j), rlo(i-j))
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+        emit("mul r%s, r%s", rhi(j), llo(i-j))
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+    emit("st z+, r%s", acc(0))
+    print("")
+    rotate_acc()
+
+# Compute rows overlapping old block: each row adds 4 high*low products per operand pairing
+for i in range(4, size):  # range() exists on Python 2 and 3; xrange() is Python 2 only
+    emit("ldi r%s, 0", acc(2))  # clear the slot that was recycled by rotate_acc()
+    emit("ld r0, z")  # fold in the byte already present at this result position
+    emit("add r%s, r0", acc(0))
+    emit("adc r%s, r25", acc(1))  # carry only; assumes r25 == 0 (not set by this script - TODO confirm)
+    update_low()  # slide the low windows so slot 3 holds the newly loaded byte
+    for j in range(4):
+        emit("mul r%s, r%s", lhi(j), rlo(3-j))
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+        emit("mul r%s, r%s", rhi(j), llo(3-j))
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+    emit("st z+, r%s", acc(0))
+    print("")  # blank separator line; call form works on Python 2 and 3
+    rotate_acc()
+
+# Compute new triangle: 7 registers per operand (last 3 low-window slots, then the 4 high bytes)
+left_combined = [llo(1), llo(2), llo(3), lhi(0), lhi(1), lhi(2), lhi(3)]  # llo() read after the last slide
+right_combined = [rlo(1), rlo(2), rlo(3), rhi(0), rhi(1), rhi(2), rhi(3)]  # rlo() read after the last slide
+
+def left(i):  # register at position i (0..6) of left_combined
+    return left_combined[i]
+
+def right(i):  # register at position i (0..6) of right_combined
+    return right_combined[i]
+
+for i in range(6):  # range() exists on Python 2 and 3; xrange() is Python 2 only
+    emit("ldi r%s, 0", acc(2))  # no "ld r0, z" in these rows: nothing is read back from Z
+    for j in range(7 - i):  # row i pairs left(i+j) with right(6-j); the row shrinks as i grows
+        emit("mul r%s, r%s", left(i+j), right(6-j))
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))  # carry only; assumes r25 == 0 (not set by this script - TODO confirm)
+    emit("st z+, r%s", acc(0))
+    print("")  # blank separator line; call form works on Python 2 and 3
+    rotate_acc()
+
+emit("mul r%s, r%s", left(6), right(6))  # last product: top byte of each operand
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r1", acc(1))  # no third accumulator byte is updated for this final row
+emit("st z+, r%s", acc(0))
+emit("st z+, r%s", acc(1))  # store the two remaining accumulator bytes
+emit("adiw r26, 4")  # advance X (r27:r26) by 4
+emit("adiw r28, 4")  # advance Y (r29:r28) by 4