Add fast multiply asm for AVR (#50)
diff --git a/asm_arm.inc b/asm_arm.inc
index cd235b8..5f07264 100644
--- a/asm_arm.inc
+++ b/asm_arm.inc
@@ -3,8 +3,6 @@
#ifndef _UECC_ASM_ARM_H_
#define _UECC_ASM_ARM_H_
-#include "asm_arm_mult_square.inc"
-
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
#define uECC_MIN_WORDS 8
#endif
@@ -158,6 +156,8 @@
#if (uECC_OPTIMIZATION_LEVEL >= 3)
+#include "asm_arm_mult_square.inc"
+
#define FAST_MULT_ASM_5_TO_6 \
"cmp r3, #5 \n\t" \
"beq 1f \n\t" \
diff --git a/asm_avr.inc b/asm_avr.inc
index c99bf82..0d4582f 100644
--- a/asm_avr.inc
+++ b/asm_avr.inc
@@ -3,6 +3,22 @@
#ifndef _UECC_ASM_AVR_H_
#define _UECC_ASM_AVR_H_
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1) /* Curves are tested largest-first; every later enabled (smaller) curve overrides the value, so the smallest enabled size wins. */
+ #define uECC_MIN_WORDS 32 /* 32 words for the 256-bit curves, i.e. 8-bit words in this file (asm_arm.inc uses 8 for the same curves) */
+#endif
+#if uECC_SUPPORTS_secp224r1
+ #undef uECC_MIN_WORDS /* drop any larger value chosen above before redefining */
+ #define uECC_MIN_WORDS 28
+#endif
+#if uECC_SUPPORTS_secp192r1
+ #undef uECC_MIN_WORDS
+ #define uECC_MIN_WORDS 24
+#endif
+#if uECC_SUPPORTS_secp160r1
+ #undef uECC_MIN_WORDS
+ #define uECC_MIN_WORDS 20
+#endif /* NOTE(review): uECC_MIN_WORDS stays undefined here if none of the curves above is enabled */
+
#if __AVR_HAVE_EIJMP_EICALL__
#define IJMP "eijmp \n\t"
#else
@@ -189,6 +205,64 @@
}
#define asm_sub 1
+#if (uECC_OPTIMIZATION_LEVEL >= 3)
+
+#include "asm_avr_mult_square.inc"
+
+__attribute((noinline)) /* Kept out of line. NOTE(review): presumably so the named asm label "done" below is emitted only once, and so the argument registers are as the asm expects -- confirm. */
+uECC_VLI_API void uECC_vli_mult(uECC_word_t *result, /* Multiplies left by right with the unrolled asm macros from asm_avr_mult_square.inc; result is written through Z. */
+ const uECC_word_t *left, /* read through X by the asm */
+ const uECC_word_t *right, /* read through Y by the asm */
+ wordcount_t num_words) { /* operand length in words; the _TO_ macros compare it against 20, 24, ... to decide when to stop */
+ /* num_words should already be in r18 (presumably where the avr-gcc calling convention passes this argument -- confirm); the binding below pins it there for the asm. */
+ register wordcount_t r18 __asm__("r18") = num_words;
+
+ __asm__ volatile (
+ "push r18 \n\t" /* saved here and restored by the "pop r18" after the base multiply; presumably the base macro uses r18 as scratch. NOTE(review): if no uECC_MIN_WORDS branch below matches, this push has no matching pop. */
+#if (uECC_MIN_WORDS == 20) /* the smallest enabled size selects the base multiply; larger sizes are reached by chaining the _TO_ steps */
+ FAST_MULT_ASM_20
+ "pop r18 \n\t" /* num_words back in r18 for the size checks in the _TO_ macros */
+ #if (uECC_MAX_WORDS > 20) /* each _TO_ step jumps to "done" when r18 equals its starting size, otherwise it extends the product */
+ FAST_MULT_ASM_20_TO_24
+ #endif
+ #if (uECC_MAX_WORDS > 24)
+ FAST_MULT_ASM_24_TO_28
+ #endif
+ #if (uECC_MAX_WORDS > 28)
+ FAST_MULT_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 24)
+ FAST_MULT_ASM_24
+ "pop r18 \n\t"
+ #if (uECC_MAX_WORDS > 24)
+ FAST_MULT_ASM_24_TO_28
+ #endif
+ #if (uECC_MAX_WORDS > 28)
+ FAST_MULT_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 28)
+ FAST_MULT_ASM_28
+ "pop r18 \n\t"
+ #if (uECC_MAX_WORDS > 28)
+ FAST_MULT_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 32)
+ FAST_MULT_ASM_32
+ "pop r18 \n\t" /* no _TO_ step follows: 32 is the largest size handled here */
+#endif
+ "done: \n\t" /* exit target of the "jmp done" in the _TO_ macros. NOTE(review): a named (non-numeric) label, so this asm statement must not be duplicated in the object file. */
+ "eor r1, r1 \n\t" /* mul leaves its product in r1:r0; clear r1 again because avr-gcc expects r1 to hold zero */
+ : "+x" (left), "+y" (right), "+z" (result) /* X, Y and Z pointer pairs; read-write because the asm post-increments and adjusts them */
+ : "r" (r18) /* num_words, bound to r18 by the register variable above */
+ : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", /* scratch registers overwritten by the multiply macros */
+ "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
+ "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+ );
+}
+#define asm_mult 1
+
+#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
+
#if uECC_SUPPORTS_secp160r1
static const struct uECC_Curve_t curve_secp160r1;
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
@@ -704,6 +778,8 @@
#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
+/* ---- "Small" implementations ---- */
+
#if !asm_add
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
const uECC_word_t *left,
diff --git a/asm_avr_mult_square.inc b/asm_avr_mult_square.inc
index 5581bb4..687c618 100644
--- a/asm_avr_mult_square.inc
+++ b/asm_avr_mult_square.inc
@@ -3,7 +3,7 @@
#ifndef _UECC_ASM_AVR_MULT_SQUARE_H_
#define _UECC_ASM_AVR_MULT_SQUARE_H_
-#define FAST_MULT_ASM_5 \
+#define FAST_MULT_ASM_20 \
"adiw r30, 10 \n\t" \
"adiw r28, 10 \n\t" \
"ld r2, x+ \n\t" \
@@ -1905,10 +1905,911 @@
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r24 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r24 \n\t"
-#define FAST_MULT_ASM_6 \
+#define FAST_MULT_ASM_20_TO_24 /* Step from a 20-word to a 24-word product; meant to follow FAST_MULT_ASM_20 inside the asm of uECC_vli_mult. */ \
+ "cpi r18, 20 \n\t" /* r18 holds num_words */ \
+ "brne 1f \n\t" \
+ "jmp done \n\t" /* exactly 20 words: nothing to extend; "done" is a label provided by the enclosing asm statement */ \
+ "1: \n\t" \
+ "ld r2, x+ \n\t" /* r2-r5 and r6-r9: the next 4 bytes read through X (left) and Y (right), loaded alternately */ \
+ "ld r6, y+ \n\t" \
+ "ld r3, x+ \n\t" \
+ "ld r7, y+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r8, y+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "ld r9, y+ \n\t" \
+ "sbiw r26, 24 \n\t" /* step X back by 24 bytes */ \
+ "sbiw r28, 24 \n\t" /* step Y back by 24 bytes */ \
+ "sbiw r30, 20 \n\t" /* step Z back by 20 bytes. NOTE(review): presumably X/Y now point at word 0 and Z at result word 20 -- confirm against the exit state of FAST_MULT_ASM_20 */ \
+ "ld r10, x+ \n\t" /* r10-r13 and r14-r17: the first 4 bytes at the rewound X and Y */ \
+ "ld r14, y+ \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ /* Each group below produces one output byte: products are summed into a rotating 3-register accumulator (r19/r20/r21), the byte already stored at Z is added, and the low byte is written back with "st z+". r25 supplies zero for carry propagation; it is not set in this macro, so presumably it was zeroed by the preceding base multiply -- confirm. */ \
+ "mul r2, r14 \n\t" \
+ "mov r19, r0 \n\t" \
+ "mov r20, r1 \n\t" \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ /* From here on each group first loads the next byte through X and Y into the rotating r10-r13 / r14-r17 windows, replacing the oldest entry. */ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ /* Remaining groups: no further loads and no existing byte at Z is added; only products of the register contents already held are accumulated. */ \
+ "ldi r20, 0 \n\t" \
+ "mul r11, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r12, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r13, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r12, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r13, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r13, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ /* Last product: the final two bytes are stored, then X and Y are advanced by 4 (presumably to sit past the 4 bytes consumed by this step, ready for a following _TO_ macro -- confirm). */ \
+ "mul r5, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "st z+, r21 \n\t" \
+ "st z+, r19 \n\t" \
+ "adiw r26, 4 \n\t" \
+ "adiw r28, 4 \n\t"
+
+#define FAST_MULT_ASM_24 \
"adiw r30, 20 \n\t" \
"adiw r28, 20 \n\t" \
"ld r2, x+ \n\t" \
@@ -4677,10 +5578,1071 @@
"add r22, r0 \n\t" \
"adc r23, r1 \n\t" \
"st z+, r22 \n\t" \
- "st z+, r23 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r23 \n\t"
-#define FAST_MULT_ASM_7 \
+#define FAST_MULT_ASM_24_TO_28 \
+ "cpi r18, 24 \n\t" \
+ "brne 1f \n\t" \
+ "jmp done \n\t" \
+ "1: \n\t" \
+ "ld r2, x+ \n\t" \
+ "ld r6, y+ \n\t" \
+ "ld r3, x+ \n\t" \
+ "ld r7, y+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r8, y+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "ld r9, y+ \n\t" \
+ "sbiw r26, 28 \n\t" \
+ "sbiw r28, 28 \n\t" \
+ "sbiw r30, 24 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ \
+ "mul r2, r14 \n\t" \
+ "mov r19, r0 \n\t" \
+ "mov r20, r1 \n\t" \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r11, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r12, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r13, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r12, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r13, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "mul r13, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "mul r5, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "st z+, r19 \n\t" \
+ "st z+, r20 \n\t" \
+ "adiw r26, 4 \n\t" \
+ "adiw r28, 4 \n\t"
+
+#define FAST_MULT_ASM_28 \
"adiw r30, 20 \n\t" \
"adiw r28, 20 \n\t" \
"ld r2, x+ \n\t" \
@@ -8437,10 +10399,1230 @@
"add r24, r0 \n\t" \
"adc r22, r1 \n\t" \
"st z+, r24 \n\t" \
- "st z+, r22 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r22 \n\t"
-#define FAST_MULT_ASM_8 \
+#define FAST_MULT_ASM_28_TO_32 \
+ "cpi r18, 28 \n\t" \
+ "brne 1f \n\t" \
+ "jmp done \n\t" \
+ "1: \n\t" \
+ "ld r2, x+ \n\t" \
+ "ld r6, y+ \n\t" \
+ "ld r3, x+ \n\t" \
+ "ld r7, y+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r8, y+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "ld r9, y+ \n\t" \
+ "sbiw r26, 32 \n\t" \
+ "sbiw r28, 32 \n\t" \
+ "sbiw r30, 28 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ \
+ "mul r2, r14 \n\t" \
+ "mov r19, r0 \n\t" \
+ "mov r20, r1 \n\t" \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r10, x+ \n\t" \
+ "ld r14, y+ \n\t" \
+ "mul r2, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r25 \n\t" \
+ "ld r11, x+ \n\t" \
+ "ld r15, y+ \n\t" \
+ "mul r2, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r6, r11 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r14 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r7, r10 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r8, r13 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r9, r12 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r25 \n\t" \
+ "ld r12, x+ \n\t" \
+ "ld r16, y+ \n\t" \
+ "mul r2, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r6, r12 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r15 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r7, r11 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r14 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r8, r10 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r9, r13 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "ld r0, z \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r25 \n\t" \
+ "ld r13, x+ \n\t" \
+ "ld r17, y+ \n\t" \
+ "mul r2, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r6, r13 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r16 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r7, r12 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r15 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r8, r11 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r14 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r9, r10 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r11, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r12, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r13, r7 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r17 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r16 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r15 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "mul r12, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r13, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r17 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r16 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r13, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r17 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "ldi r19, 0 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "st z+, r20 \n\t" \
+ \
+ "ldi r20, 0 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r20, r25 \n\t" \
+ "st z+, r21 \n\t" \
+ \
+ "ldi r21, 0 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r20, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "st z+, r19 \n\t" \
+ \
+ "mul r5, r9 \n\t" \
+ "add r20, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "st z+, r20 \n\t" \
+ "st z+, r21 \n\t"
+    /* The X/Y operand pointers are not advanced here: sizes > 32 are not supported, so no further extension step follows. */
+
+#define FAST_MULT_ASM_32 \
"adiw r30, 30 \n\t" \
"adiw r28, 30 \n\t" \
"ld r2, x+ \n\t" \
@@ -13352,10 +16534,9 @@
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r24 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r24 \n\t"
-#define FAST_SQUARE_ASM_5 \
+#define FAST_SQUARE_ASM_20 \
"ld r2, x+ \n\t" \
"ld r3, x+ \n\t" \
"ld r4, x+ \n\t" \
@@ -14510,7 +17691,7 @@
"st z+, r25 \n\t" \
"eor r1, r1 \n\t"
-#define FAST_SQUARE_ASM_6 \
+#define FAST_SQUARE_ASM_24 \
"ldi r25, 0 \n\t" \
"movw r28, r26 \n\t" \
"ld r2, x+ \n\t" \
@@ -16157,7 +19338,7 @@
"st z+, r28 \n\t" \
"eor r1, r1 \n\t"
-#define FAST_SQUARE_ASM_7 \
+#define FAST_SQUARE_ASM_28 \
"ldi r25, 0 \n\t" \
"movw r28, r26 \n\t" \
"ld r2, x+ \n\t" \
@@ -18360,7 +21541,7 @@
"st z+, r28 \n\t" \
"eor r1, r1 \n\t"
-#define FAST_SQUARE_ASM_8 \
+#define FAST_SQUARE_ASM_32 \
"ldi r25, 0 \n\t" \
"movw r28, r26 \n\t" \
"ld r2, x+ \n\t" \
diff --git a/scripts/mult_avr_extra.py b/scripts/mult_avr_extra.py
new file mode 100755
index 0000000..f6e654f
--- /dev/null
+++ b/scripts/mult_avr_extra.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+import sys
+
+if len(sys.argv) < 2:
+ print "Provide the integer size in bytes"
+ sys.exit(1)
+
+size = int(sys.argv[1])
+
+def lhi(i):  # register number for byte i (0..3) of the left operand's new high part: r2..r5
+    return i + 2
+
+def rhi(i):  # register number for byte i (0..3) of the right operand's new high part: r6..r9
+    return i + 6
+
+left_lo = [10, 11, 12, 13]  # registers holding a sliding 4-byte window over the left operand's low part
+right_lo = [14, 15, 16, 17]  # same for the right operand; update_low() rotates both lists
+
+def llo(i):  # register number currently in slot i (0 = oldest byte) of the left low window
+    return left_lo[i]
+
+def rlo(i):  # register number currently in slot i (0 = oldest byte) of the right low window
+    return right_lo[i]
+
+def emit(line, *args):  # print one instruction as a double-quoted C string-literal line
+    s = '"' + line + r' \n\t"'  # raw string: a literal backslash-n backslash-t is appended, not real whitespace
+    print s % args  # fill the placeholders in line, typically with register numbers
+
+def update_low():  # slide both low windows forward by one byte
+    global left_lo
+    global right_lo
+    left_lo = left_lo[1:] + left_lo[:1]  # rotate left: the register of the oldest byte moves to the last slot
+    right_lo = right_lo[1:] + right_lo[:1]
+    emit("ld r%s, x+", left_lo[3])  # reload that recycled register with the next left byte through X
+    emit("ld r%s, y+", right_lo[3])  # and the next right byte through Y
+
+accum = [19, 20, 21]  # three registers accumulating the current result column, lowest byte first
+
+def acc(i):  # register number of accumulator byte i (0 = byte stored for the current column)
+    return accum[i]
+
+def rotate_acc():  # move to the next column: the two carry bytes shift down, the stored byte becomes the top slot
+    global accum
+    accum = accum[1:] + accum[:1]
+
+# Load the 4 new high bytes of each operand: left into r2..r5 through X, right into r6..r9 through Y
+for i in xrange(4):
+    emit("ld r%s, x+", lhi(i))
+    emit("ld r%s, y+", rhi(i))
+
+emit("sbiw r26, %s", size + 4)  # rewind X (r27:r26) past the 4 bytes just read plus size more
+emit("sbiw r28, %s", size + 4)  # rewind Y (r29:r28) by the same amount
+emit("sbiw r30, %s", size)  # rewind Z (r31:r30) by size; NOTE(review): presumably Z entered just past the old product - confirm
+
+# Load the first 4 low bytes of each operand into the window registers (r10..r13 left, r14..r17 right)
+for i in xrange(4):
+    emit("ld r%s, x+", llo(i))
+    emit("ld r%s, y+", rlo(i))
+print ""  # blank line separating sections of the generated text
+
+# Initial triangle, first column: two cross products (high byte 0 times low byte 0, both ways) on top of the byte already at Z
+emit("mul r%s, r%s", lhi(0), rlo(0))  # mul leaves its 16-bit product in r1:r0
+emit("mov r%s, r0", acc(0))  # seed the accumulator with that product
+emit("mov r%s, r1", acc(1))
+emit("ldi r%s, 0", acc(2))
+emit("ld r0, z")  # fetch the result byte already stored at Z (Z is not advanced by this load)
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r25", acc(1))  # NOTE(review): r25 is never written here; presumably held at zero by surrounding code, so this adds only the carry
+emit("mul r%s, r%s", rhi(0), llo(0))  # the mirrored product: right high byte 0 times left low byte 0
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r1", acc(1))
+emit("adc r%s, r25", acc(2))
+emit("st z+, r%s", acc(0))  # write the finished column byte and advance Z
+print ""
+rotate_acc()
+
+for i in xrange(1, 4):  # remaining initial-triangle columns: column i sums i+1 product pairs
+    emit("ldi r%s, 0", acc(2))  # clear the accumulator slot recycled by rotate_acc()
+    emit("ld r0, z")  # fetch the result byte already stored at Z
+    emit("add r%s, r0", acc(0))
+    emit("adc r%s, r25", acc(1))  # NOTE(review): r25 presumably holds zero (set outside this script), so only the carry is added
+    for j in xrange(i + 1):
+        emit("mul r%s, r%s", lhi(j), rlo(i-j))  # left high byte j times right low-window slot i-j
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+        emit("mul r%s, r%s", rhi(j), llo(i-j))  # mirrored: right high byte j times left low-window slot i-j
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+    emit("st z+, r%s", acc(0))  # write the finished column byte and advance Z
+    print ""
+    rotate_acc()
+
+# Columns that overlap the old product: each adds 4 product pairs to the byte already at Z, sliding the low windows by one byte per column
+for i in xrange(4, size):
+    emit("ldi r%s, 0", acc(2))  # clear the accumulator slot recycled by rotate_acc()
+    emit("ld r0, z")  # fetch the result byte already stored at Z
+    emit("add r%s, r0", acc(0))
+    emit("adc r%s, r25", acc(1))  # NOTE(review): r25 presumably holds zero (set outside this script), so only the carry is added
+    update_low()  # bring the next low byte of each operand into the windows
+    for j in xrange(4):
+        emit("mul r%s, r%s", lhi(j), rlo(3-j))  # left high byte j times right low-window slot 3-j
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+        emit("mul r%s, r%s", rhi(j), llo(3-j))  # mirrored: right high byte j times left low-window slot 3-j
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))
+    emit("st z+, r%s", acc(0))  # write the finished column byte and advance Z
+    print ""
+    rotate_acc()
+
+# New triangle: register lists made of the last 3 low-window slots followed by the 4 high registers (7 entries per operand)
+left_combined = [llo(1), llo(2), llo(3), lhi(0), lhi(1), lhi(2), lhi(3)]
+right_combined = [rlo(1), rlo(2), rlo(3), rhi(0), rhi(1), rhi(2), rhi(3)]
+
+def left(i):  # register number at position i (0..6) of the combined left list
+    return left_combined[i]
+
+def right(i):  # register number at position i (0..6) of the combined right list
+    return right_combined[i]
+
+for i in xrange(6):  # new-triangle columns: column i sums 7-i products; nothing is loaded from Z here
+    emit("ldi r%s, 0", acc(2))  # clear the accumulator slot recycled by rotate_acc()
+    for j in xrange(7 - i):
+        emit("mul r%s, r%s", left(i+j), right(6-j))  # list positions i+j and 6-j always sum to i+6
+        emit("add r%s, r0", acc(0))
+        emit("adc r%s, r1", acc(1))
+        emit("adc r%s, r25", acc(2))  # NOTE(review): r25 presumably holds zero (set outside this script), so only the carry is added
+    emit("st z+, r%s", acc(0))  # write the finished column byte and advance Z
+    print ""
+    rotate_acc()
+
+emit("mul r%s, r%s", left(6), right(6))  # last column: product of the final entries of both combined lists
+emit("add r%s, r0", acc(0))
+emit("adc r%s, r1", acc(1))
+emit("st z+, r%s", acc(0))
+emit("st z+, r%s", acc(1))  # only two accumulator bytes are stored for this last column
+emit("adiw r26, 4")  # advance X (r27:r26) by 4; NOTE(review): presumably positions X for a following extension step - confirm
+emit("adiw r28, 4")  # advance Y (r29:r28) by 4 likewise