Add fast square asm for AVR (#50)
diff --git a/asm_avr.inc b/asm_avr.inc
index b6c4a41..c988040 100644
--- a/asm_avr.inc
+++ b/asm_avr.inc
@@ -250,17 +250,70 @@
         FAST_MULT_ASM_32
         "pop r18 \n\t"
 #endif
-        "done: \n\t"
+        "2: \n\t"
         "eor r1, r1 \n\t"
         : "+x" (left), "+y" (right), "+z" (result)
         : "r" (r18)
         : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
           "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
-          "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+          "r21", "r22", "r23", "r24", "r25", "cc"
     );
 }
 #define asm_mult 1
 
+#if uECC_SQUARE_FUNC
+__attribute((noinline))
+uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
+                                  const uECC_word_t *left,
+                                  wordcount_t num_words) {
+    /* Squares left into result using the unrolled FAST_SQUARE_ASM_* code; num_words should already be in r20. */
+    register wordcount_t r20 __asm__("r20") = num_words;
+    /* The asm reads via X and writes via Z; only the pointers are operands, hence the "memory" clobber below. */
+    __asm__ volatile (
+        "push r20 \n\t" /* keep num_words; restored by the pop after the first block */
+#if (uECC_MIN_WORDS == 20)
+        FAST_SQUARE_ASM_20
+        "pop r20 \n\t"
+    #if (uECC_MAX_WORDS > 20)
+        FAST_SQUARE_ASM_20_TO_24
+    #endif
+    #if (uECC_MAX_WORDS > 24)
+        FAST_SQUARE_ASM_24_TO_28
+    #endif
+    #if (uECC_MAX_WORDS > 28)
+        FAST_SQUARE_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 24)
+        FAST_SQUARE_ASM_24
+        "pop r20 \n\t"
+    #if (uECC_MAX_WORDS > 24)
+        FAST_SQUARE_ASM_24_TO_28
+    #endif
+    #if (uECC_MAX_WORDS > 28)
+        FAST_SQUARE_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 28)
+        FAST_SQUARE_ASM_28
+        "pop r20 \n\t"
+    #if (uECC_MAX_WORDS > 28)
+        FAST_SQUARE_ASM_28_TO_32
+    #endif
+#elif (uECC_MIN_WORDS == 32)
+        FAST_SQUARE_ASM_32
+        "pop r20 \n\t"
+#endif
+        "2: \n\t" /* common exit label; presumably the target of early "jmp 2f" exits in the *_TO_* blocks */
+        "eor r1, r1 \n\t" /* mul leaves product bits in r1; avr-gcc expects r1 == 0 */
+        : "+x" (left), "+z" (result)
+        : "r" (r20)
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+          "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
+          "r21", "r22", "r23", "r24", "r25", "r28", "r29", "cc", "memory"
+    );
+}
+#define asm_square 1
+#endif /* uECC_SQUARE_FUNC */
+
 #endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
 
 #if uECC_SUPPORTS_secp160r1
@@ -434,7 +487,7 @@
         
         : "+x" (product), [carry] "+r" (carry)
         : "y" (result)
-        : "r0", "r18", "r19", "r30", "r31", "cc", "memory"
+        : "r0", "r18", "r19", "r30", "r31", "cc"
     );
 
     if (carry > 0) {
@@ -759,7 +812,7 @@
         
         : "+x" (product), [carry] "+r" (carry)
         : "y" (result)
-        : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc", "memory"
+        : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc"
     );
     
     if (carry > 0) {
diff --git a/asm_avr_mult_square.inc b/asm_avr_mult_square.inc
index 687c618..7ae08bc 100644
--- a/asm_avr_mult_square.inc
+++ b/asm_avr_mult_square.inc
@@ -1910,7 +1910,7 @@
 #define FAST_MULT_ASM_20_TO_24     \
     "cpi r18, 20 \n\t"             \
     "brne 1f \n\t"                 \
-    "jmp done \n\t"                \
+    "jmp 2f \n\t"                  \
     "1: \n\t"                      \
     "ld r2, x+ \n\t"               \
     "ld r6, y+ \n\t"               \
@@ -5583,7 +5583,7 @@
 #define FAST_MULT_ASM_24_TO_28     \
     "cpi r18, 24 \n\t"             \
     "brne 1f \n\t"                 \
-    "jmp done \n\t"                \
+    "jmp 2f \n\t"                  \
     "1: \n\t"                      \
     "ld r2, x+ \n\t"               \
     "ld r6, y+ \n\t"               \
@@ -10404,7 +10404,7 @@
 #define FAST_MULT_ASM_28_TO_32     \
     "cpi r18, 28 \n\t"             \
     "brne 1f \n\t"                 \
-    "jmp done \n\t"                \
+    "jmp 2f \n\t"                  \
     "1: \n\t"                      \
     "ld r2, x+ \n\t"               \
     "ld r6, y+ \n\t"               \
@@ -16557,7 +16557,9 @@
     "ld r19, x+ \n\t"              \
     "ld r20, x+ \n\t"              \
     "ld r21, x+ \n\t"              \
-    "ldi r27, 0 \n\t"              \
+    "push r26 \n\t"                \
+    "push r27 \n\t"                \
+    "ldi r25, 0 \n\t"              \
                                    \
     "ldi r23, 0 \n\t"              \
     "mul r2, r2 \n\t"              \
@@ -16568,40 +16570,40 @@
     "mul r2, r3 \n\t"              \
     "lsl r0 \n\t"                  \
     "rol r1 \n\t"                  \
-    "adc r24, r27 \n\t"            \
+    "adc r24, r25 \n\t"            \
     "add r22, r0 \n\t"             \
     "adc r23, r1 \n\t"             \
-    "adc r24, r27 \n\t"            \
+    "adc r24, r25 \n\t"            \
     "st z+, r22 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
     "mul r2, r4 \n\t"              \
     "lsl r0 \n\t"                  \
     "rol r1 \n\t"                  \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r3, r3 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r5 \n\t"              \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r4 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16611,37 +16613,37 @@
     "mul r3, r5 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r4, r4 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r7 \n\t"              \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r6 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r5 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16651,45 +16653,45 @@
     "mul r3, r7 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r6 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r5, r5 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r9 \n\t"              \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r8 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r7 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r6 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16699,53 +16701,53 @@
     "mul r3, r9 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r8 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r7 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r6, r6 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r11 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r10 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r9 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r8 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r7 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16755,61 +16757,61 @@
     "mul r3, r11 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r10 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r9 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r8 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r7, r7 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r13 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r12 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r11 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r10 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r9 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r8 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16819,69 +16821,69 @@
     "mul r3, r13 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r12 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r11 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r10 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r9 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r8, r8 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r15 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r14 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r13 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r12 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r11 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r10 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r9 \n\t"              \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16891,77 +16893,77 @@
     "mul r3, r15 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r14 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r13 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r12 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r11 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r8, r10 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r9, r9 \n\t"              \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r17 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r16 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r15 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r14 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r13 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r12 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r11 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r9, r10 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -16971,85 +16973,85 @@
     "mul r3, r17 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r16 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r15 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r14 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r13 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r8, r12 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r9, r11 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r10, r10 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r19 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r18 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r17 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r16 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r15 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r14 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r13 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r9, r12 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r10, r11 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17059,93 +17061,93 @@
     "mul r3, r19 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r4, r18 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r17 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r16 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r15 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r8, r14 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r9, r13 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r10, r12 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r11, r11 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r2, r21 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r3, r20 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r4, r19 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r5, r18 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r17 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r16 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r15 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r9, r14 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r10, r13 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r11, r12 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17155,89 +17157,89 @@
     "mul r4, r20 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r5, r19 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r6, r18 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r17 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r8, r16 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r9, r15 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r10, r14 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r11, r13 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r12, r12 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r4, r21 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r5, r20 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r6, r19 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r7, r18 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r17 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r9, r16 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r10, r15 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r11, r14 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r12, r13 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17247,81 +17249,81 @@
     "mul r6, r20 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r7, r19 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r8, r18 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r9, r17 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r10, r16 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r11, r15 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r12, r14 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r13, r13 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r6, r21 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r7, r20 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r8, r19 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r9, r18 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r10, r17 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r11, r16 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r12, r15 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r13, r14 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17331,73 +17333,73 @@
     "mul r8, r20 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r9, r19 \n\t"             \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r10, r18 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r11, r17 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r12, r16 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r13, r15 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r14, r14 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r8, r21 \n\t"             \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r9, r20 \n\t"             \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r10, r19 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r11, r18 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r12, r17 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r13, r16 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r14, r15 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17407,65 +17409,65 @@
     "mul r10, r20 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r11, r19 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r12, r18 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r13, r17 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r14, r16 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r15, r15 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r10, r21 \n\t"            \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r11, r20 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r12, r19 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r13, r18 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r14, r17 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r15, r16 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17475,57 +17477,57 @@
     "mul r12, r20 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r13, r19 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r14, r18 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r15, r17 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r16, r16 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r12, r21 \n\t"            \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r13, r20 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r14, r19 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r15, r18 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r16, r17 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17535,49 +17537,49 @@
     "mul r14, r20 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r15, r19 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r16, r18 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r17, r17 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r14, r21 \n\t"            \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r15, r20 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r16, r19 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r17, r18 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17587,41 +17589,41 @@
     "mul r16, r20 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "mul r17, r19 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r18, r18 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r16, r21 \n\t"            \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r17, r20 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "mul r18, r19 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r22, 0 \n\t"              \
@@ -17631,65 +17633,622 @@
     "mul r18, r20 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "lsl r23 \n\t"                 \
     "rol r24 \n\t"                 \
     "rol r22 \n\t"                 \
     "mul r19, r19 \n\t"            \
     "add r23, r0 \n\t"             \
     "adc r24, r1 \n\t"             \
-    "adc r22, r27 \n\t"            \
-    "add r23, r25 \n\t"            \
+    "adc r22, r25 \n\t"            \
+    "add r23, r27 \n\t"            \
     "adc r24, r26 \n\t"            \
-    "adc r22, r27 \n\t"            \
+    "adc r22, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r26, 0 \n\t"              \
     "mul r18, r21 \n\t"            \
     "mov r23, r0 \n\t"             \
-    "mov r25, r1 \n\t"             \
+    "mov r27, r1 \n\t"             \
     "mul r19, r20 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r1 \n\t"             \
+    "adc r26, r25 \n\t"            \
     "lsl r23 \n\t"                 \
-    "rol r25 \n\t"                 \
+    "rol r27 \n\t"                 \
     "rol r26 \n\t"                 \
     "add r23, r24 \n\t"            \
-    "adc r25, r22 \n\t"            \
-    "adc r26, r27 \n\t"            \
+    "adc r27, r22 \n\t"            \
+    "adc r26, r25 \n\t"            \
     "st z+, r23 \n\t"              \
                                    \
     "ldi r23, 0 \n\t"              \
     "mul r19, r21 \n\t"            \
     "lsl r0 \n\t"                  \
     "rol r1 \n\t"                  \
-    "adc r23, r27 \n\t"            \
-    "add r25, r0 \n\t"             \
+    "adc r23, r25 \n\t"            \
+    "add r27, r0 \n\t"             \
     "adc r26, r1 \n\t"             \
-    "adc r23, r27 \n\t"            \
+    "adc r23, r25 \n\t"            \
     "mul r20, r20 \n\t"            \
-    "add r25, r0 \n\t"             \
+    "add r27, r0 \n\t"             \
     "adc r26, r1 \n\t"             \
-    "adc r23, r27 \n\t"            \
-    "st z+, r25 \n\t"              \
+    "adc r23, r25 \n\t"            \
+    "st z+, r27 \n\t"              \
                                    \
-    "ldi r25, 0 \n\t"              \
+    "ldi r27, 0 \n\t"              \
     "mul r20, r21 \n\t"            \
     "lsl r0 \n\t"                  \
     "rol r1 \n\t"                  \
-    "adc r25, r27 \n\t"            \
+    "adc r27, r25 \n\t"            \
     "add r26, r0 \n\t"             \
     "adc r23, r1 \n\t"             \
-    "adc r25, r27 \n\t"            \
+    "adc r27, r25 \n\t"            \
     "st z+, r26 \n\t"              \
                                    \
     "mul r21, r21 \n\t"            \
     "add r23, r0 \n\t"             \
-    "adc r25, r1 \n\t"             \
+    "adc r27, r1 \n\t"             \
     "st z+, r23 \n\t"              \
-    "st z+, r25 \n\t"              \
-    "eor r1, r1 \n\t"
+    "st z+, r27 \n\t"              \
+    "pop r27 \n\t"                 \
+    "pop r26 \n\t"
+
+#define FAST_SQUARE_ASM_20_TO_24           \
+    "cpi r20, 20 \n\t"                     \
+    "brne 1f \n\t"                         \
+    "jmp 2f \n\t"                          \
+    "1: \n\t"                              \
+    "ld r2, x+ \n\t"                       \
+    "ld r3, x+ \n\t"                       \
+    "ld r4, x+ \n\t"                       \
+    "ld r5, x+ \n\t"                       \
+    "sbiw r26, 24 \n\t"                    \
+    "sbiw r30, 20 \n\t"                    \
+    "ld r6, x+ \n\t"                       \
+    "ld r7, x+ \n\t"                       \
+    "ld r8, x+ \n\t"                       \
+    "ld r9, x+ \n\t"                       \
+                                           \
+    "mul r2, r6 \n\t"                      \
+    "mov r10, r0 \n\t"                     \
+    "mov r11, r1 \n\t"                     \
+    "mov r12, r25 \n\t"                    \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r16, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r17, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r18, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r21, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r22, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r23, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r24, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r28, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r29, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+                                           \
+    "lsl r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r21 \n\t"                         \
+    "rol r22 \n\t"                         \
+    "rol r23 \n\t"                         \
+    "rol r24 \n\t"                         \
+    "rol r28 \n\t"                         \
+    "rol r29 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r14, r0 \n\t"                     \
+    "st z+, r14 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r15, r0 \n\t"                     \
+    "st z+, r15 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r16, r0 \n\t"                     \
+    "st z+, r16 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r17, r0 \n\t"                     \
+    "st z+, r17 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r18, r0 \n\t"                     \
+    "st z+, r18 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r19, r0 \n\t"                     \
+    "st z+, r19 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r21, r0 \n\t"                     \
+    "st z+, r21 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r22, r0 \n\t"                     \
+    "st z+, r22 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r23, r0 \n\t"                     \
+    "st z+, r23 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r24, r0 \n\t"                     \
+    "st z+, r24 \n\t"                      \
+    "adc r28, r25 \n\t"                    \
+    "adc r29, r25 \n\t"                    \
+    "bst r28, 0 \n\t"                      \
+    "lsr r29 \n\t"                         \
+    "ror r28 \n\t"                         \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r10, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r11, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r12, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "lsl r28 \n\t"                         \
+    "bld r28, 0 \n\t"                      \
+    "rol r29 \n\t"                         \
+    "rol r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r28, r0 \n\t"                     \
+    "st z+, r28 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r29, r0 \n\t"                     \
+    "st z+, r29 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "adc r14, r25 \n\t"                    \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "mul r2, r2 \n\t"                      \
+    "mov r16, r0 \n\t"                     \
+    "mov r17, r1 \n\t"                     \
+    "mul r3, r3 \n\t"                      \
+    "mov r18, r0 \n\t"                     \
+    "mov r19, r1 \n\t"                     \
+    "mul r4, r4 \n\t"                      \
+    "mov r21, r0 \n\t"                     \
+    "mov r22, r1 \n\t"                     \
+    "mul r5, r5 \n\t"                      \
+    "mov r23, r0 \n\t"                     \
+    "mov r24, r1 \n\t"                     \
+    "add r16, r14 \n\t"                    \
+    "adc r17, r15 \n\t"                    \
+    "adc r18, r25 \n\t"                    \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "mul r7, r5 \n\t"                      \
+    "mov r14, r0 \n\t"                     \
+    "mov r15, r1 \n\t"                     \
+    "mov r28, r25 \n\t"                    \
+    "mul r8, r4 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r9, r3 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mov r29, r25 \n\t"                    \
+    "mul r8, r5 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r9, r4 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r2, r3 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mov r10, r25 \n\t"                    \
+    "mul r9, r5 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r2, r4 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mov r11, r25 \n\t"                    \
+    "mul r2, r5 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r3, r4 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mov r12, r25 \n\t"                    \
+    "mul r3, r5 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r4, r5 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+                                           \
+    "lsl r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r28 \n\t"                         \
+    "rol r29 \n\t"                         \
+    "rol r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "adc r24, r25 \n\t"                    \
+    "add r16, r14 \n\t"                    \
+    "adc r17, r15 \n\t"                    \
+    "adc r18, r28 \n\t"                    \
+    "adc r19, r29 \n\t"                    \
+    "adc r21, r10 \n\t"                    \
+    "adc r22, r11 \n\t"                    \
+    "adc r23, r12 \n\t"                    \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "st z+, r16 \n\t"                      \
+    "st z+, r17 \n\t"                      \
+    "st z+, r18 \n\t"                      \
+    "st z+, r19 \n\t"                      \
+    "st z+, r21 \n\t"                      \
+    "st z+, r22 \n\t"                      \
+    "st z+, r23 \n\t"                      \
+    "st z+, r24 \n\t"                      \
+    "adiw r26, 4 \n\t"
 
 #define FAST_SQUARE_ASM_24             \
     "ldi r25, 0 \n\t"                  \
@@ -19335,8 +19894,655 @@
     "add r23, r0 \n\t"                 \
     "adc r28, r1 \n\t"                 \
     "st z+, r23 \n\t"                  \
-    "st z+, r28 \n\t"                  \
-    "eor r1, r1 \n\t"
+    "st z+, r28 \n\t"
+
+#define FAST_SQUARE_ASM_24_TO_28           \
+    "cpi r20, 24 \n\t"                     \
+    "brne 1f \n\t"                         \
+    "jmp 2f \n\t"                          \
+    "1: \n\t"                              \
+    "ld r2, x+ \n\t"                       \
+    "ld r3, x+ \n\t"                       \
+    "ld r4, x+ \n\t"                       \
+    "ld r5, x+ \n\t"                       \
+    "sbiw r26, 28 \n\t"                    \
+    "sbiw r30, 24 \n\t"                    \
+    "ld r6, x+ \n\t"                       \
+    "ld r7, x+ \n\t"                       \
+    "ld r8, x+ \n\t"                       \
+    "ld r9, x+ \n\t"                       \
+                                           \
+    "mul r2, r6 \n\t"                      \
+    "mov r10, r0 \n\t"                     \
+    "mov r11, r1 \n\t"                     \
+    "mov r12, r25 \n\t"                    \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r16, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r17, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r18, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r21, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r22, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r23, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r24, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r28, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r29, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+                                           \
+    "lsl r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r21 \n\t"                         \
+    "rol r22 \n\t"                         \
+    "rol r23 \n\t"                         \
+    "rol r24 \n\t"                         \
+    "rol r28 \n\t"                         \
+    "rol r29 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r14, r0 \n\t"                     \
+    "st z+, r14 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r15, r0 \n\t"                     \
+    "st z+, r15 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r16, r0 \n\t"                     \
+    "st z+, r16 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r17, r0 \n\t"                     \
+    "st z+, r17 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r18, r0 \n\t"                     \
+    "st z+, r18 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r19, r0 \n\t"                     \
+    "st z+, r19 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r21, r0 \n\t"                     \
+    "st z+, r21 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r22, r0 \n\t"                     \
+    "st z+, r22 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r23, r0 \n\t"                     \
+    "st z+, r23 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r24, r0 \n\t"                     \
+    "st z+, r24 \n\t"                      \
+    "adc r28, r25 \n\t"                    \
+    "adc r29, r25 \n\t"                    \
+    "bst r28, 0 \n\t"                      \
+    "lsr r29 \n\t"                         \
+    "ror r28 \n\t"                         \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r10, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r11, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r12, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r16, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r17, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r18, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "lsl r28 \n\t"                         \
+    "bld r28, 0 \n\t"                      \
+    "rol r29 \n\t"                         \
+    "rol r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r28, r0 \n\t"                     \
+    "st z+, r28 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r29, r0 \n\t"                     \
+    "st z+, r29 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r14, r0 \n\t"                     \
+    "st z+, r14 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r15, r0 \n\t"                     \
+    "st z+, r15 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r16, r0 \n\t"                     \
+    "st z+, r16 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r17, r0 \n\t"                     \
+    "st z+, r17 \n\t"                      \
+    "adc r18, r25 \n\t"                    \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "mul r2, r2 \n\t"                      \
+    "mov r21, r0 \n\t"                     \
+    "mov r22, r1 \n\t"                     \
+    "mul r3, r3 \n\t"                      \
+    "mov r23, r0 \n\t"                     \
+    "mov r24, r1 \n\t"                     \
+    "mul r4, r4 \n\t"                      \
+    "mov r28, r0 \n\t"                     \
+    "mov r29, r1 \n\t"                     \
+    "mul r5, r5 \n\t"                      \
+    "mov r10, r0 \n\t"                     \
+    "mov r11, r1 \n\t"                     \
+    "add r21, r18 \n\t"                    \
+    "adc r22, r19 \n\t"                    \
+    "adc r23, r25 \n\t"                    \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "mul r7, r5 \n\t"                      \
+    "mov r18, r0 \n\t"                     \
+    "mov r19, r1 \n\t"                     \
+    "mov r12, r25 \n\t"                    \
+    "mul r8, r4 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r9, r3 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mov r13, r25 \n\t"                    \
+    "mul r8, r5 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r9, r4 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r2, r3 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mov r14, r25 \n\t"                    \
+    "mul r9, r5 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r2, r4 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r5 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r4 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mov r16, r25 \n\t"                    \
+    "mul r3, r5 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r5 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+                                           \
+    "lsl r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "adc r11, r25 \n\t"                    \
+    "add r21, r18 \n\t"                    \
+    "adc r22, r19 \n\t"                    \
+    "adc r23, r12 \n\t"                    \
+    "adc r24, r13 \n\t"                    \
+    "adc r28, r14 \n\t"                    \
+    "adc r29, r15 \n\t"                    \
+    "adc r10, r16 \n\t"                    \
+    "adc r11, r25 \n\t"                    \
+                                           \
+    "st z+, r21 \n\t"                      \
+    "st z+, r22 \n\t"                      \
+    "st z+, r23 \n\t"                      \
+    "st z+, r24 \n\t"                      \
+    "st z+, r28 \n\t"                      \
+    "st z+, r29 \n\t"                      \
+    "st z+, r10 \n\t"                      \
+    "st z+, r11 \n\t"                      \
+    "adiw r26, 4 \n\t"
 
 #define FAST_SQUARE_ASM_28         \
     "ldi r25, 0 \n\t"              \
@@ -21538,8 +22744,747 @@
     "add r23, r0 \n\t"             \
     "adc r28, r1 \n\t"             \
     "st z+, r23 \n\t"              \
-    "st z+, r28 \n\t"              \
-    "eor r1, r1 \n\t"
+    "st z+, r28 \n\t"
+
+#define FAST_SQUARE_ASM_28_TO_32           \
+    "cpi r20, 28 \n\t"                     \
+    "brne 1f \n\t"                         \
+    "jmp 2f \n\t"                          \
+    "1: \n\t"                              \
+    "ld r2, x+ \n\t"                       \
+    "ld r3, x+ \n\t"                       \
+    "ld r4, x+ \n\t"                       \
+    "ld r5, x+ \n\t"                       \
+    "sbiw r26, 32 \n\t"                    \
+    "sbiw r30, 28 \n\t"                    \
+    "ld r6, x+ \n\t"                       \
+    "ld r7, x+ \n\t"                       \
+    "ld r8, x+ \n\t"                       \
+    "ld r9, x+ \n\t"                       \
+                                           \
+    "mul r2, r6 \n\t"                      \
+    "mov r10, r0 \n\t"                     \
+    "mov r11, r1 \n\t"                     \
+    "mov r12, r25 \n\t"                    \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r16, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r17, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r18, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r21, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r22, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r23, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r24, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r28, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r28, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r29, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r28, r1 \n\t"                     \
+    "adc r29, r25 \n\t"                    \
+                                           \
+    "lsl r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r21 \n\t"                         \
+    "rol r22 \n\t"                         \
+    "rol r23 \n\t"                         \
+    "rol r24 \n\t"                         \
+    "rol r28 \n\t"                         \
+    "rol r29 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r14, r0 \n\t"                     \
+    "st z+, r14 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r15, r0 \n\t"                     \
+    "st z+, r15 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r16, r0 \n\t"                     \
+    "st z+, r16 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r17, r0 \n\t"                     \
+    "st z+, r17 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r18, r0 \n\t"                     \
+    "st z+, r18 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r19, r0 \n\t"                     \
+    "st z+, r19 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r21, r0 \n\t"                     \
+    "st z+, r21 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r22, r0 \n\t"                     \
+    "st z+, r22 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r23, r0 \n\t"                     \
+    "st z+, r23 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r24, r0 \n\t"                     \
+    "st z+, r24 \n\t"                      \
+    "adc r28, r25 \n\t"                    \
+    "adc r29, r25 \n\t"                    \
+    "bst r28, 0 \n\t"                      \
+    "lsr r29 \n\t"                         \
+    "ror r28 \n\t"                         \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r10, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r28, r0 \n\t"                     \
+    "adc r29, r1 \n\t"                     \
+    "adc r10, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r11, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r29, r0 \n\t"                     \
+    "adc r10, r1 \n\t"                     \
+    "adc r11, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r12, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r10, r0 \n\t"                     \
+    "adc r11, r1 \n\t"                     \
+    "adc r12, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r13, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r11, r0 \n\t"                     \
+    "adc r12, r1 \n\t"                     \
+    "adc r13, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r14, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r12, r0 \n\t"                     \
+    "adc r13, r1 \n\t"                     \
+    "adc r14, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r15, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r13, r0 \n\t"                     \
+    "adc r14, r1 \n\t"                     \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r16, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r14, r0 \n\t"                     \
+    "adc r15, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r17, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r15, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r18, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+                                           \
+    "ld r6, x+ \n\t"                       \
+    "mov r21, r25 \n\t"                    \
+    "mul r2, r6 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r3, r9 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r4, r8 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r5, r7 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+                                           \
+    "ld r7, x+ \n\t"                       \
+    "mov r22, r25 \n\t"                    \
+    "mul r2, r7 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r3, r6 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r4, r9 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+    "mul r5, r8 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+    "adc r22, r25 \n\t"                    \
+                                           \
+    "ld r8, x+ \n\t"                       \
+    "mov r23, r25 \n\t"                    \
+    "mul r2, r8 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r3, r7 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r4, r6 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+    "mul r5, r9 \n\t"                      \
+    "add r21, r0 \n\t"                     \
+    "adc r22, r1 \n\t"                     \
+    "adc r23, r25 \n\t"                    \
+                                           \
+    "ld r9, x+ \n\t"                       \
+    "mov r24, r25 \n\t"                    \
+    "mul r2, r9 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r3, r8 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r4, r7 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+    "mul r5, r6 \n\t"                      \
+    "add r22, r0 \n\t"                     \
+    "adc r23, r1 \n\t"                     \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "lsl r28 \n\t"                         \
+    "bld r28, 0 \n\t"                      \
+    "rol r29 \n\t"                         \
+    "rol r10 \n\t"                         \
+    "rol r11 \n\t"                         \
+    "rol r12 \n\t"                         \
+    "rol r13 \n\t"                         \
+    "rol r14 \n\t"                         \
+    "rol r15 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r21 \n\t"                         \
+    "rol r22 \n\t"                         \
+    "rol r23 \n\t"                         \
+    "rol r24 \n\t"                         \
+    "ld r0, z \n\t"                        \
+    "add r28, r0 \n\t"                     \
+    "st z+, r28 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r29, r0 \n\t"                     \
+    "st z+, r29 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r10, r0 \n\t"                     \
+    "st z+, r10 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r11, r0 \n\t"                     \
+    "st z+, r11 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r12, r0 \n\t"                     \
+    "st z+, r12 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r13, r0 \n\t"                     \
+    "st z+, r13 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r14, r0 \n\t"                     \
+    "st z+, r14 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r15, r0 \n\t"                     \
+    "st z+, r15 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r16, r0 \n\t"                     \
+    "st z+, r16 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r17, r0 \n\t"                     \
+    "st z+, r17 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r18, r0 \n\t"                     \
+    "st z+, r18 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r19, r0 \n\t"                     \
+    "st z+, r19 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r21, r0 \n\t"                     \
+    "st z+, r21 \n\t"                      \
+    "ld r0, z \n\t"                        \
+    "adc r22, r0 \n\t"                     \
+    "st z+, r22 \n\t"                      \
+    "adc r23, r25 \n\t"                    \
+    "adc r24, r25 \n\t"                    \
+                                           \
+    "mul r2, r2 \n\t"                      \
+    "mov r28, r0 \n\t"                     \
+    "mov r29, r1 \n\t"                     \
+    "mul r3, r3 \n\t"                      \
+    "mov r10, r0 \n\t"                     \
+    "mov r11, r1 \n\t"                     \
+    "mul r4, r4 \n\t"                      \
+    "mov r12, r0 \n\t"                     \
+    "mov r13, r1 \n\t"                     \
+    "mul r5, r5 \n\t"                      \
+    "mov r14, r0 \n\t"                     \
+    "mov r15, r1 \n\t"                     \
+    "add r28, r23 \n\t"                    \
+    "adc r29, r24 \n\t"                    \
+    "adc r10, r25 \n\t"                    \
+    "adc r11, r25 \n\t"                    \
+                                           \
+    "mul r7, r5 \n\t"                      \
+    "mov r23, r0 \n\t"                     \
+    "mov r24, r1 \n\t"                     \
+    "mov r16, r25 \n\t"                    \
+    "mul r8, r4 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mul r9, r3 \n\t"                      \
+    "add r23, r0 \n\t"                     \
+    "adc r24, r1 \n\t"                     \
+    "adc r16, r25 \n\t"                    \
+    "mov r17, r25 \n\t"                    \
+    "mul r8, r5 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r9, r4 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mul r2, r3 \n\t"                      \
+    "add r24, r0 \n\t"                     \
+    "adc r16, r1 \n\t"                     \
+    "adc r17, r25 \n\t"                    \
+    "mov r18, r25 \n\t"                    \
+    "mul r9, r5 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mul r2, r4 \n\t"                      \
+    "add r16, r0 \n\t"                     \
+    "adc r17, r1 \n\t"                     \
+    "adc r18, r25 \n\t"                    \
+    "mov r19, r25 \n\t"                    \
+    "mul r2, r5 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mul r3, r4 \n\t"                      \
+    "add r17, r0 \n\t"                     \
+    "adc r18, r1 \n\t"                     \
+    "adc r19, r25 \n\t"                    \
+    "mov r21, r25 \n\t"                    \
+    "mul r3, r5 \n\t"                      \
+    "add r18, r0 \n\t"                     \
+    "adc r19, r1 \n\t"                     \
+    "adc r21, r25 \n\t"                    \
+    "mul r4, r5 \n\t"                      \
+    "add r19, r0 \n\t"                     \
+    "adc r21, r1 \n\t"                     \
+                                           \
+    "lsl r23 \n\t"                         \
+    "rol r24 \n\t"                         \
+    "rol r16 \n\t"                         \
+    "rol r17 \n\t"                         \
+    "rol r18 \n\t"                         \
+    "rol r19 \n\t"                         \
+    "rol r21 \n\t"                         \
+    "adc r15, r25 \n\t"                    \
+    "add r28, r23 \n\t"                    \
+    "adc r29, r24 \n\t"                    \
+    "adc r10, r16 \n\t"                    \
+    "adc r11, r17 \n\t"                    \
+    "adc r12, r18 \n\t"                    \
+    "adc r13, r19 \n\t"                    \
+    "adc r14, r21 \n\t"                    \
+    "adc r15, r25 \n\t"                    \
+                                           \
+    "st z+, r28 \n\t"                      \
+    "st z+, r29 \n\t"                      \
+    "st z+, r10 \n\t"                      \
+    "st z+, r11 \n\t"                      \
+    "st z+, r12 \n\t"                      \
+    "st z+, r13 \n\t"                      \
+    "st z+, r14 \n\t"                      \
+    "st z+, r15 \n\t"                      \
+    "adiw r26, 4 \n\t"
 
 #define FAST_SQUARE_ASM_32             \
     "ldi r25, 0 \n\t"                  \
@@ -24361,7 +26306,6 @@
     "add r23, r0 \n\t"                 \
     "adc r28, r1 \n\t"                 \
     "st z+, r23 \n\t"                  \
-    "st z+, r28 \n\t"                  \
-    "eor r1, r1 \n\t"
+    "st z+, r28 \n\t"
 
 #endif /* _UECC_ASM_AVR_MULT_SQUARE_H_ */