Add fast square asm for AVR (#50)
diff --git a/asm_avr.inc b/asm_avr.inc
index b6c4a41..c988040 100644
--- a/asm_avr.inc
+++ b/asm_avr.inc
@@ -250,17 +250,70 @@
FAST_MULT_ASM_32
"pop r18 \n\t"
#endif
- "done: \n\t"
+ "2: \n\t"
"eor r1, r1 \n\t"
: "+x" (left), "+y" (right), "+z" (result)
: "r" (r18)
: "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
"r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
- "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+ "r21", "r22", "r23", "r24", "r25", "cc"
);
}
#define asm_mult 1
+#if uECC_SQUARE_FUNC
+/* Squares the multi-word integer at `left` into `result` using the unrolled
+ * FAST_SQUARE_ASM_* sequences (defined outside this function).
+ *
+ * result    - output buffer; handed to the asm in the Z pointer pair ("+z").
+ * left      - input operand; handed to the asm in the X pointer pair ("+x").
+ * num_words - operand length in words; handed to the asm in r20.
+ *
+ * NOTE(review): the number of words written to `result` (presumably
+ * 2 * num_words) and the set of accepted num_words values are determined by
+ * the FAST_SQUARE_ASM_* macros - confirm against their definitions.
+ */
+__attribute((noinline))
+uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
+ const uECC_word_t *left,
+ wordcount_t num_words) {
+ /* num_words should already be in r20; binding it to an explicit register
+ * variable lets it be passed to the asm below as an input operand. */
+ register wordcount_t r20 __asm__("r20") = num_words;
+
+ __asm__ volatile (
+ /* r20 is parked on the stack and popped right after the base-size macro.
+ * NOTE(review): presumably because that macro reuses r20 as scratch -
+ * confirm against the macro body. */
+ "push r20 \n\t"
+#if (uECC_MIN_WORDS == 20)
+ FAST_SQUARE_ASM_20
+ "pop r20 \n\t"
+ #if (uECC_MAX_WORDS > 20)
+ FAST_SQUARE_ASM_20_TO_24
+ #endif
+ #if (uECC_MAX_WORDS > 24)
+ FAST_SQUARE_ASM_24_TO_28
+ #endif
+ #if (uECC_MAX_WORDS > 28)
+ FAST_SQUARE_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 24)
+ FAST_SQUARE_ASM_24
+ "pop r20 \n\t"
+ #if (uECC_MAX_WORDS > 24)
+ FAST_SQUARE_ASM_24_TO_28
+ #endif
+ #if (uECC_MAX_WORDS > 28)
+ FAST_SQUARE_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 28)
+ FAST_SQUARE_ASM_28
+ "pop r20 \n\t"
+ #if (uECC_MAX_WORDS > 28)
+ FAST_SQUARE_ASM_28_TO_32
+ #endif
+#elif (uECC_MIN_WORDS == 32)
+ FAST_SQUARE_ASM_32
+ "pop r20 \n\t"
+#endif
+ /* Local numeric label; NOTE(review): presumably the early-exit target of
+ * "jmp 2f" inside the *_TO_* macros, as in the multiply variants. */
+ "2: \n\t"
+ /* mul leaves its product in r1:r0; clear r1 again before returning to
+ * compiled code, which expects r1 to hold zero under the avr-gcc ABI. */
+ "eor r1, r1 \n\t"
+ : "+x" (left), "+z" (result)
+ : "r" (r20)
+ /* "memory" is required: the asm loads through X and stores through Z,
+ * and those accesses are not described by any operand. r28/r29 (the Y
+ * pair) are listed because no operand is bound to Y here. */
+ : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
+ "r21", "r22", "r23", "r24", "r25", "r28", "r29", "cc", "memory"
+ );
+}
+#define asm_square 1
+#endif /* uECC_SQUARE_FUNC */
+
#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
#if uECC_SUPPORTS_secp160r1
@@ -434,7 +487,7 @@
: "+x" (product), [carry] "+r" (carry)
: "y" (result)
- : "r0", "r18", "r19", "r30", "r31", "cc", "memory"
+ : "r0", "r18", "r19", "r30", "r31", "cc"
);
if (carry > 0) {
@@ -759,7 +812,7 @@
: "+x" (product), [carry] "+r" (carry)
: "y" (result)
- : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc", "memory"
+ : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc"
);
if (carry > 0) {
diff --git a/asm_avr_mult_square.inc b/asm_avr_mult_square.inc
index 687c618..7ae08bc 100644
--- a/asm_avr_mult_square.inc
+++ b/asm_avr_mult_square.inc
@@ -1910,7 +1910,7 @@
#define FAST_MULT_ASM_20_TO_24 \
"cpi r18, 20 \n\t" \
"brne 1f \n\t" \
- "jmp done \n\t" \
+ "jmp 2f \n\t" \
"1: \n\t" \
"ld r2, x+ \n\t" \
"ld r6, y+ \n\t" \
@@ -5583,7 +5583,7 @@
#define FAST_MULT_ASM_24_TO_28 \
"cpi r18, 24 \n\t" \
"brne 1f \n\t" \
- "jmp done \n\t" \
+ "jmp 2f \n\t" \
"1: \n\t" \
"ld r2, x+ \n\t" \
"ld r6, y+ \n\t" \
@@ -10404,7 +10404,7 @@
#define FAST_MULT_ASM_28_TO_32 \
"cpi r18, 28 \n\t" \
"brne 1f \n\t" \
- "jmp done \n\t" \
+ "jmp 2f \n\t" \
"1: \n\t" \
"ld r2, x+ \n\t" \
"ld r6, y+ \n\t" \
@@ -16557,7 +16557,9 @@
"ld r19, x+ \n\t" \
"ld r20, x+ \n\t" \
"ld r21, x+ \n\t" \
- "ldi r27, 0 \n\t" \
+ "push r26 \n\t" \
+ "push r27 \n\t" \
+ "ldi r25, 0 \n\t" \
\
"ldi r23, 0 \n\t" \
"mul r2, r2 \n\t" \
@@ -16568,40 +16570,40 @@
"mul r2, r3 \n\t" \
"lsl r0 \n\t" \
"rol r1 \n\t" \
- "adc r24, r27 \n\t" \
+ "adc r24, r25 \n\t" \
"add r22, r0 \n\t" \
"adc r23, r1 \n\t" \
- "adc r24, r27 \n\t" \
+ "adc r24, r25 \n\t" \
"st z+, r22 \n\t" \
\
"ldi r22, 0 \n\t" \
"mul r2, r4 \n\t" \
"lsl r0 \n\t" \
"rol r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r3, r3 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r5 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r4 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16611,37 +16613,37 @@
"mul r3, r5 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r4, r4 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r7 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r6 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r5 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16651,45 +16653,45 @@
"mul r3, r7 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r6 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r5, r5 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r9 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r8 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r7 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r6 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16699,53 +16701,53 @@
"mul r3, r9 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r8 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r7 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r6, r6 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r11 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r10 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r9 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r8 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r7 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16755,61 +16757,61 @@
"mul r3, r11 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r10 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r9 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r8 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r7, r7 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r13 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r12 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r11 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r10 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r9 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r8 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16819,69 +16821,69 @@
"mul r3, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r12 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r11 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r10 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r9 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r8, r8 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r15 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r13 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r12 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r11 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r10 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r9 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16891,77 +16893,77 @@
"mul r3, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r12 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r11 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r8, r10 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r9, r9 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r17 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r13 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r12 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r11 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r9, r10 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -16971,85 +16973,85 @@
"mul r3, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r8, r12 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r9, r11 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r10, r10 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r19 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r13 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r9, r12 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r10, r11 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17059,93 +17061,93 @@
"mul r3, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r4, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r8, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r9, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r10, r12 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r11, r11 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r2, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r3, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r4, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r5, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r9, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r10, r13 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r11, r12 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17155,89 +17157,89 @@
"mul r4, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r5, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r6, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r8, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r9, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r10, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r11, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r12, r12 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r4, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r5, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r6, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r7, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r9, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r10, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r11, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r12, r13 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17247,81 +17249,81 @@
"mul r6, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r7, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r8, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r9, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r10, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r11, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r12, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r13, r13 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r6, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r7, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r8, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r9, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r10, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r11, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r12, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r13, r14 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17331,73 +17333,73 @@
"mul r8, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r9, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r10, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r11, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r12, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r13, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r14, r14 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r8, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r9, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r10, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r11, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r12, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r13, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r14, r15 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17407,65 +17409,65 @@
"mul r10, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r11, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r12, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r13, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r14, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r15, r15 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r10, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r11, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r12, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r13, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r14, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r15, r16 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17475,57 +17477,57 @@
"mul r12, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r13, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r14, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r15, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r16, r16 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r12, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r13, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r14, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r15, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r16, r17 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17535,49 +17537,49 @@
"mul r14, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r15, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r16, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r17, r17 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r14, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r15, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r16, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r17, r18 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17587,41 +17589,41 @@
"mul r16, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"mul r17, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r18, r18 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r16, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r17, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"mul r18, r19 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r22, 0 \n\t" \
@@ -17631,65 +17633,622 @@
"mul r18, r20 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"lsl r23 \n\t" \
"rol r24 \n\t" \
"rol r22 \n\t" \
"mul r19, r19 \n\t" \
"add r23, r0 \n\t" \
"adc r24, r1 \n\t" \
- "adc r22, r27 \n\t" \
- "add r23, r25 \n\t" \
+ "adc r22, r25 \n\t" \
+ "add r23, r27 \n\t" \
"adc r24, r26 \n\t" \
- "adc r22, r27 \n\t" \
+ "adc r22, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r26, 0 \n\t" \
"mul r18, r21 \n\t" \
"mov r23, r0 \n\t" \
- "mov r25, r1 \n\t" \
+ "mov r27, r1 \n\t" \
"mul r19, r20 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r1 \n\t" \
+ "adc r26, r25 \n\t" \
"lsl r23 \n\t" \
- "rol r25 \n\t" \
+ "rol r27 \n\t" \
"rol r26 \n\t" \
"add r23, r24 \n\t" \
- "adc r25, r22 \n\t" \
- "adc r26, r27 \n\t" \
+ "adc r27, r22 \n\t" \
+ "adc r26, r25 \n\t" \
"st z+, r23 \n\t" \
\
"ldi r23, 0 \n\t" \
"mul r19, r21 \n\t" \
"lsl r0 \n\t" \
"rol r1 \n\t" \
- "adc r23, r27 \n\t" \
- "add r25, r0 \n\t" \
+ "adc r23, r25 \n\t" \
+ "add r27, r0 \n\t" \
"adc r26, r1 \n\t" \
- "adc r23, r27 \n\t" \
+ "adc r23, r25 \n\t" \
"mul r20, r20 \n\t" \
- "add r25, r0 \n\t" \
+ "add r27, r0 \n\t" \
"adc r26, r1 \n\t" \
- "adc r23, r27 \n\t" \
- "st z+, r25 \n\t" \
+ "adc r23, r25 \n\t" \
+ "st z+, r27 \n\t" \
\
- "ldi r25, 0 \n\t" \
+ "ldi r27, 0 \n\t" \
"mul r20, r21 \n\t" \
"lsl r0 \n\t" \
"rol r1 \n\t" \
- "adc r25, r27 \n\t" \
+ "adc r27, r25 \n\t" \
"add r26, r0 \n\t" \
"adc r23, r1 \n\t" \
- "adc r25, r27 \n\t" \
+ "adc r27, r25 \n\t" \
"st z+, r26 \n\t" \
\
"mul r21, r21 \n\t" \
"add r23, r0 \n\t" \
- "adc r25, r1 \n\t" \
+ "adc r27, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r25 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r27 \n\t" \
+ "pop r27 \n\t" \
+ "pop r26 \n\t"
+
+#define FAST_SQUARE_ASM_20_TO_24 /* Asm fragment used by uECC_vli_square after the 20-byte square: leaves early when r20 (num_words) is 20, otherwise folds four more input bytes into the result. r25 is only read here (to clear registers and propagate carries); NOTE(review): assumes r25 == 0 and that X/Z were left in position by FAST_SQUARE_ASM_20 - confirm. */ \
+ "cpi r20, 20 \n\t" /* r20 holds num_words (bound in uECC_vli_square) */ \
+ "brne 1f \n\t" /* not 20 words: run the extension below */ \
+ "jmp 2f \n\t" /* exactly 20 words: jump to label 2 in the enclosing asm block */ \
+ "1: \n\t" \
+ "ld r2, x+ \n\t" /* r2..r5 = next four bytes read through X (presumably left[20..23] - confirm) */ \
+ "ld r3, x+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "sbiw r26, 24 \n\t" /* X (r27:r26) -= 24; presumably rewinds to left[0] - confirm entry value */ \
+ "sbiw r30, 20 \n\t" /* Z (r31:r30) -= 20; presumably result[20] - confirm entry value */ \
+ "ld r6, x+ \n\t" /* r6..r9 = sliding window of four bytes read through X */ \
+ "ld r7, x+ \n\t" \
+ "ld r8, x+ \n\t" \
+ "ld r9, x+ \n\t" \
+ /* Columns 0-1: products of r2..r5 with the window, summed by byte offset into r10..r13 */ \
+ "mul r2, r6 \n\t" \
+ "mov r10, r0 \n\t" \
+ "mov r11, r1 \n\t" \
+ "mov r12, r25 \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* Column 2: accumulates in r12, carries into r13 and r14 */ \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* Column 3: accumulates in r13, carries into r14 and r15 */ \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* Column 4: window slides (r6 reloaded from X); accumulates in r14, carries into r15 and r16 */ \
+ "ld r6, x+ \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ /* Column 5: r7 reloaded; accumulates in r15, carries into r16 and r17 */ \
+ "ld r7, x+ \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ /* Column 6: r8 reloaded; accumulates in r16, carries into r17 and r18 */ \
+ "ld r8, x+ \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ /* Column 7: r9 reloaded; accumulates in r17, carries into r18 and r19 */ \
+ "ld r9, x+ \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* Column 8: accumulates in r18, carries into r19 and r21 (r20 is left untouched; it holds num_words) */ \
+ "ld r6, x+ \n\t" \
+ "mov r21, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ /* Column 9: accumulates in r19, carries into r21 and r22 */ \
+ "ld r7, x+ \n\t" \
+ "mov r22, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ /* Column 10: accumulates in r21, carries into r22 and r23 */ \
+ "ld r8, x+ \n\t" \
+ "mov r23, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ /* Column 11: accumulates in r22, carries into r23 and r24 */ \
+ "ld r9, x+ \n\t" \
+ "mov r24, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* Column 12: accumulates in r23, carries into r24 and r28 */ \
+ "ld r6, x+ \n\t" \
+ "mov r28, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ /* Column 13: accumulates in r24, carries into r28 and r29 */ \
+ "ld r7, x+ \n\t" \
+ "mov r29, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ /* Double the 16-byte sum held in r10..r19, r21..r24, r28, r29 (cross terms count twice in a square) */ \
+ "lsl r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r21 \n\t" \
+ "rol r22 \n\t" \
+ "rol r23 \n\t" \
+ "rol r24 \n\t" \
+ "rol r28 \n\t" \
+ "rol r29 \n\t" \
+ "ld r0, z \n\t" /* add the doubled bytes onto the 14 bytes already stored at Z, writing back in place */ \
+ "add r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r14, r0 \n\t" \
+ "st z+, r14 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r15, r0 \n\t" \
+ "st z+, r15 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r16, r0 \n\t" \
+ "st z+, r16 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r17, r0 \n\t" \
+ "st z+, r17 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r18, r0 \n\t" \
+ "st z+, r18 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r19, r0 \n\t" \
+ "st z+, r19 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r21, r0 \n\t" \
+ "st z+, r21 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r22, r0 \n\t" \
+ "st z+, r22 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r23, r0 \n\t" \
+ "st z+, r23 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r24, r0 \n\t" \
+ "st z+, r24 \n\t" \
+ "adc r28, r25 \n\t" /* fold the final carry into r29:r28 */ \
+ "adc r29, r25 \n\t" \
+ "bst r28, 0 \n\t" /* keep bit 0 in T and halve r29:r28 so the next columns add undoubled products */ \
+ "lsr r29 \n\t" \
+ "ror r28 \n\t" \
+ /* Column 14: accumulates in r28, carries into r29 and r10 */ \
+ "ld r8, x+ \n\t" \
+ "mov r10, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ /* Column 15: accumulates in r29, carries into r10 and r11 */ \
+ "ld r9, x+ \n\t" \
+ "mov r11, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ /* Column 16: accumulates in r10, carries into r11 and r12 */ \
+ "ld r6, x+ \n\t" \
+ "mov r12, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ /* Column 17: accumulates in r11, carries into r12 and r13 */ \
+ "ld r7, x+ \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* Column 18: accumulates in r12, carries into r13 and r14 */ \
+ "ld r8, x+ \n\t" \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* Column 19: accumulates in r13, carries into r14 and r15 */ \
+ "ld r9, x+ \n\t" \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* Double r28, r29, r10..r15, put the saved T bit back into bit 0 of r28, then add onto the next 6 bytes at Z */ \
+ "lsl r28 \n\t" \
+ "bld r28, 0 \n\t" \
+ "rol r29 \n\t" \
+ "rol r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "ld r0, z \n\t" \
+ "add r28, r0 \n\t" \
+ "st z+, r28 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r29, r0 \n\t" \
+ "st z+, r29 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "adc r14, r25 \n\t" /* carry out of the stored bytes goes into r15:r14 */ \
+ "adc r15, r25 \n\t" \
+ /* Squares of r2..r5 laid out as 8 bytes in r16..r19, r21..r24, plus the carried-in r15:r14 */ \
+ "mul r2, r2 \n\t" \
+ "mov r16, r0 \n\t" \
+ "mov r17, r1 \n\t" \
+ "mul r3, r3 \n\t" \
+ "mov r18, r0 \n\t" \
+ "mov r19, r1 \n\t" \
+ "mul r4, r4 \n\t" \
+ "mov r21, r0 \n\t" \
+ "mov r22, r1 \n\t" \
+ "mul r5, r5 \n\t" \
+ "mov r23, r0 \n\t" \
+ "mov r24, r1 \n\t" \
+ "add r16, r14 \n\t" \
+ "adc r17, r15 \n\t" \
+ "adc r18, r25 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* Remaining products (window bytes r7..r9 with r3..r5, and r2..r5 with each other) into r14, r15, r28, r29, r10, r11, r12 */ \
+ "mul r7, r5 \n\t" \
+ "mov r14, r0 \n\t" \
+ "mov r15, r1 \n\t" \
+ "mov r28, r25 \n\t" \
+ "mul r8, r4 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r9, r3 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mov r29, r25 \n\t" \
+ "mul r8, r5 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r9, r4 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r2, r3 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mov r10, r25 \n\t" \
+ "mul r9, r5 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r2, r4 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mov r11, r25 \n\t" \
+ "mul r2, r5 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r3, r4 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mov r12, r25 \n\t" \
+ "mul r3, r5 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r4, r5 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ /* Double those products and add them onto the squares; carries out of either step land in r24 */ \
+ "lsl r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r28 \n\t" \
+ "rol r29 \n\t" \
+ "rol r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "adc r24, r25 \n\t" \
+ "add r16, r14 \n\t" \
+ "adc r17, r15 \n\t" \
+ "adc r18, r28 \n\t" \
+ "adc r19, r29 \n\t" \
+ "adc r21, r10 \n\t" \
+ "adc r22, r11 \n\t" \
+ "adc r23, r12 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* Store the last 8 result bytes through Z */ \
+ "st z+, r16 \n\t" \
+ "st z+, r17 \n\t" \
+ "st z+, r18 \n\t" \
+ "st z+, r19 \n\t" \
+ "st z+, r21 \n\t" \
+ "st z+, r22 \n\t" \
+ "st z+, r23 \n\t" \
+ "st z+, r24 \n\t" \
+ "adiw r26, 4 \n\t" /* X (r27:r26) += 4 */
#define FAST_SQUARE_ASM_24 \
"ldi r25, 0 \n\t" \
@@ -19335,8 +19894,655 @@
"add r23, r0 \n\t" \
"adc r28, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r28 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r28 \n\t"
+
+#define FAST_SQUARE_ASM_24_TO_28 /* Asm fragment used by uECC_vli_square when uECC_MAX_WORDS > 24: leaves early when r20 (num_words) is 24, otherwise folds four more input bytes into the result. r25 is only read here (to clear registers and propagate carries); NOTE(review): assumes r25 == 0 and that X/Z were left in position by the preceding fragment - confirm. */ \
+ "cpi r20, 24 \n\t" /* r20 holds num_words (bound in uECC_vli_square) */ \
+ "brne 1f \n\t" /* not 24 words: run the extension below */ \
+ "jmp 2f \n\t" /* exactly 24 words: jump to label 2 in the enclosing asm block */ \
+ "1: \n\t" \
+ "ld r2, x+ \n\t" /* r2..r5 = next four bytes read through X (presumably left[24..27] - confirm) */ \
+ "ld r3, x+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "sbiw r26, 28 \n\t" /* X (r27:r26) -= 28; presumably rewinds to left[0] - confirm entry value */ \
+ "sbiw r30, 24 \n\t" /* Z (r31:r30) -= 24; presumably result[24] - confirm entry value */ \
+ "ld r6, x+ \n\t" /* r6..r9 = sliding window of four bytes read through X */ \
+ "ld r7, x+ \n\t" \
+ "ld r8, x+ \n\t" \
+ "ld r9, x+ \n\t" \
+ /* Columns 0-1: products of r2..r5 with the window, summed by byte offset into r10..r13 */ \
+ "mul r2, r6 \n\t" \
+ "mov r10, r0 \n\t" \
+ "mov r11, r1 \n\t" \
+ "mov r12, r25 \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* Column 2: accumulates in r12, carries into r13 and r14 */ \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* Column 3: accumulates in r13, carries into r14 and r15 */ \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* Column 4: window slides (r6 reloaded from X); accumulates in r14, carries into r15 and r16 */ \
+ "ld r6, x+ \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ /* Column 5: r7 reloaded; accumulates in r15, carries into r16 and r17 */ \
+ "ld r7, x+ \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ /* Column 6: r8 reloaded; accumulates in r16, carries into r17 and r18 */ \
+ "ld r8, x+ \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ /* Column 7: r9 reloaded; accumulates in r17, carries into r18 and r19 */ \
+ "ld r9, x+ \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* Column 8: accumulates in r18, carries into r19 and r21 (r20 is left untouched; it holds num_words) */ \
+ "ld r6, x+ \n\t" \
+ "mov r21, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ /* Column 9: accumulates in r19, carries into r21 and r22 */ \
+ "ld r7, x+ \n\t" \
+ "mov r22, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ /* Column 10: accumulates in r21, carries into r22 and r23 */ \
+ "ld r8, x+ \n\t" \
+ "mov r23, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ /* Column 11: accumulates in r22, carries into r23 and r24 */ \
+ "ld r9, x+ \n\t" \
+ "mov r24, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* Column 12: accumulates in r23, carries into r24 and r28 */ \
+ "ld r6, x+ \n\t" \
+ "mov r28, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ /* Column 13: accumulates in r24, carries into r28 and r29 */ \
+ "ld r7, x+ \n\t" \
+ "mov r29, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ /* Double the 16-byte sum held in r10..r19, r21..r24, r28, r29 (cross terms count twice in a square) */ \
+ "lsl r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r21 \n\t" \
+ "rol r22 \n\t" \
+ "rol r23 \n\t" \
+ "rol r24 \n\t" \
+ "rol r28 \n\t" \
+ "rol r29 \n\t" \
+ "ld r0, z \n\t" /* add the doubled bytes onto the 14 bytes already stored at Z, writing back in place */ \
+ "add r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r14, r0 \n\t" \
+ "st z+, r14 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r15, r0 \n\t" \
+ "st z+, r15 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r16, r0 \n\t" \
+ "st z+, r16 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r17, r0 \n\t" \
+ "st z+, r17 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r18, r0 \n\t" \
+ "st z+, r18 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r19, r0 \n\t" \
+ "st z+, r19 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r21, r0 \n\t" \
+ "st z+, r21 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r22, r0 \n\t" \
+ "st z+, r22 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r23, r0 \n\t" \
+ "st z+, r23 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r24, r0 \n\t" \
+ "st z+, r24 \n\t" \
+ "adc r28, r25 \n\t" /* fold the final carry into r29:r28 */ \
+ "adc r29, r25 \n\t" \
+ "bst r28, 0 \n\t" /* keep bit 0 in T and halve r29:r28 so the next columns add undoubled products */ \
+ "lsr r29 \n\t" \
+ "ror r28 \n\t" \
+ /* Column 14: accumulates in r28, carries into r29 and r10 */ \
+ "ld r8, x+ \n\t" \
+ "mov r10, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ /* Column 15: accumulates in r29, carries into r10 and r11 */ \
+ "ld r9, x+ \n\t" \
+ "mov r11, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ /* Column 16: accumulates in r10, carries into r11 and r12 */ \
+ "ld r6, x+ \n\t" \
+ "mov r12, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ /* Column 17: accumulates in r11, carries into r12 and r13 */ \
+ "ld r7, x+ \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* Column 18: accumulates in r12, carries into r13 and r14 */ \
+ "ld r8, x+ \n\t" \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* Column 19: accumulates in r13, carries into r14 and r15 */ \
+ "ld r9, x+ \n\t" \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* Column 20: accumulates in r14, carries into r15 and r16 */ \
+ "ld r6, x+ \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ /* Column 21: accumulates in r15, carries into r16 and r17 */ \
+ "ld r7, x+ \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ /* Column 22: accumulates in r16, carries into r17 and r18 */ \
+ "ld r8, x+ \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ /* Column 23: accumulates in r17, carries into r18 and r19 */ \
+ "ld r9, x+ \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* Double r28, r29, r10..r19, put the saved T bit back into bit 0 of r28, then add onto the next 10 bytes at Z */ \
+ "lsl r28 \n\t" \
+ "bld r28, 0 \n\t" \
+ "rol r29 \n\t" \
+ "rol r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "ld r0, z \n\t" \
+ "add r28, r0 \n\t" \
+ "st z+, r28 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r29, r0 \n\t" \
+ "st z+, r29 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r14, r0 \n\t" \
+ "st z+, r14 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r15, r0 \n\t" \
+ "st z+, r15 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r16, r0 \n\t" \
+ "st z+, r16 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r17, r0 \n\t" \
+ "st z+, r17 \n\t" \
+ "adc r18, r25 \n\t" /* carry out of the stored bytes goes into r19:r18 */ \
+ "adc r19, r25 \n\t" \
+ /* Squares of r2..r5 laid out as 8 bytes in r21..r24, r28, r29, r10, r11, plus the carried-in r19:r18 */ \
+ "mul r2, r2 \n\t" \
+ "mov r21, r0 \n\t" \
+ "mov r22, r1 \n\t" \
+ "mul r3, r3 \n\t" \
+ "mov r23, r0 \n\t" \
+ "mov r24, r1 \n\t" \
+ "mul r4, r4 \n\t" \
+ "mov r28, r0 \n\t" \
+ "mov r29, r1 \n\t" \
+ "mul r5, r5 \n\t" \
+ "mov r10, r0 \n\t" \
+ "mov r11, r1 \n\t" \
+ "add r21, r18 \n\t" \
+ "adc r22, r19 \n\t" \
+ "adc r23, r25 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* Remaining products (window bytes r7..r9 with r3..r5, and r2..r5 with each other) into r18, r19, r12..r16 */ \
+ "mul r7, r5 \n\t" \
+ "mov r18, r0 \n\t" \
+ "mov r19, r1 \n\t" \
+ "mov r12, r25 \n\t" \
+ "mul r8, r4 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r9, r3 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r8, r5 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r9, r4 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r2, r3 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mov r14, r25 \n\t" \
+ "mul r9, r5 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r2, r4 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mov r15, r25 \n\t" \
+ "mul r2, r5 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r4 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r3, r5 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r5 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ /* Double those products and add them onto the squares; carries out of either step land in r11 */ \
+ "lsl r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "adc r11, r25 \n\t" \
+ "add r21, r18 \n\t" \
+ "adc r22, r19 \n\t" \
+ "adc r23, r12 \n\t" \
+ "adc r24, r13 \n\t" \
+ "adc r28, r14 \n\t" \
+ "adc r29, r15 \n\t" \
+ "adc r10, r16 \n\t" \
+ "adc r11, r25 \n\t" \
+ /* Store the last 8 result bytes through Z */ \
+ "st z+, r21 \n\t" \
+ "st z+, r22 \n\t" \
+ "st z+, r23 \n\t" \
+ "st z+, r24 \n\t" \
+ "st z+, r28 \n\t" \
+ "st z+, r29 \n\t" \
+ "st z+, r10 \n\t" \
+ "st z+, r11 \n\t" \
+ "adiw r26, 4 \n\t" /* X (r27:r26) += 4 */
#define FAST_SQUARE_ASM_28 \
"ldi r25, 0 \n\t" \
@@ -21538,8 +22744,747 @@
"add r23, r0 \n\t" \
"adc r28, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r28 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r28 \n\t"
+
+#define FAST_SQUARE_ASM_28_TO_32 /* Grow a 28-byte square into a 32-byte square (X: operand, Z: result). */ \
+ "cpi r20, 28 \n\t" /* r20 carries num_words (bound in uECC_vli_square) */ \
+ "brne 1f \n\t" /* not 28: run the extension below */ \
+ "jmp 2f \n\t" /* exactly 28: skip to the common exit label 2 */ \
+ "1: \n\t" /* assumes X = left + 28 and Z = result + 56 here - TODO confirm against the preceding stage */ \
+ "ld r2, x+ \n\t" /* B0..B3 = r2..r5: the four bytes at X */ \
+ "ld r3, x+ \n\t" \
+ "ld r4, x+ \n\t" \
+ "ld r5, x+ \n\t" \
+ "sbiw r26, 32 \n\t" /* X back by 32, i.e. 28 below its entry value */ \
+ "sbiw r30, 28 \n\t" /* Z back by 28 */ \
+ "ld r6, x+ \n\t" /* r6..r9 = A[0..3]; A[j] is the j-th byte read from the rewound X */ \
+ "ld r7, x+ \n\t" \
+ "ld r8, x+ \n\t" \
+ "ld r9, x+ \n\t" \
+ /* column k = sum of B[i]*A[k-i]; r25 is assumed to still be 0 (the base macros start with ldi r25, 0) */ \
+ "mul r2, r6 \n\t" /* column 0 -> r10,r11 */ \
+ "mov r10, r0 \n\t" \
+ "mov r11, r1 \n\t" \
+ "mov r12, r25 \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" /* column 1 -> r11,r12,r13 */ \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* column 2 -> r12,r13,r14 */ \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* column 3 -> r13,r14,r15 */ \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* column 4 (r6 = A[4]) -> r14,r15,r16; from here r6..r9 act as a rotating 4-byte window over A */ \
+ "ld r6, x+ \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ /* column 5 (r7 = A[5]) -> r15,r16,r17 */ \
+ "ld r7, x+ \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ /* column 6 (r8 = A[6]) -> r16,r17,r18 */ \
+ "ld r8, x+ \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ /* column 7 (r9 = A[7]) -> r17,r18,r19 */ \
+ "ld r9, x+ \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* column 8 (r6 = A[8]) -> r18,r19,r21; r20 is skipped because it holds num_words */ \
+ "ld r6, x+ \n\t" \
+ "mov r21, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ /* column 9 (r7 = A[9]) -> r19,r21,r22 */ \
+ "ld r7, x+ \n\t" \
+ "mov r22, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ /* column 10 (r8 = A[10]) -> r21,r22,r23 */ \
+ "ld r8, x+ \n\t" \
+ "mov r23, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ /* column 11 (r9 = A[11]) -> r22,r23,r24 */ \
+ "ld r9, x+ \n\t" \
+ "mov r24, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* column 12 (r6 = A[12]) -> r23,r24,r28 */ \
+ "ld r6, x+ \n\t" \
+ "mov r28, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r28, r25 \n\t" \
+ /* column 13 (r7 = A[13]) -> r24,r28,r29 */ \
+ "ld r7, x+ \n\t" \
+ "mov r29, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r28, r1 \n\t" \
+ "adc r29, r25 \n\t" \
+ /* double columns 0..13 (16-byte left shift, carries in r28,r29), then add them into the 14 bytes at Z */ \
+ "lsl r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r21 \n\t" \
+ "rol r22 \n\t" \
+ "rol r23 \n\t" \
+ "rol r24 \n\t" \
+ "rol r28 \n\t" \
+ "rol r29 \n\t" \
+ "ld r0, z \n\t" /* read-modify-write: existing byte at Z += doubled column byte */ \
+ "add r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r14, r0 \n\t" \
+ "st z+, r14 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r15, r0 \n\t" \
+ "st z+, r15 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r16, r0 \n\t" \
+ "st z+, r16 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r17, r0 \n\t" \
+ "st z+, r17 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r18, r0 \n\t" \
+ "st z+, r18 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r19, r0 \n\t" \
+ "st z+, r19 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r21, r0 \n\t" \
+ "st z+, r21 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r22, r0 \n\t" \
+ "st z+, r22 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r23, r0 \n\t" \
+ "st z+, r23 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r24, r0 \n\t" \
+ "st z+, r24 \n\t" \
+ "adc r28, r25 \n\t" /* fold the last carry into the already-doubled r29:r28 */ \
+ "adc r29, r25 \n\t" \
+ "bst r28, 0 \n\t" /* keep bit 0 in the T flag ... */ \
+ "lsr r29 \n\t" /* ... and halve r29:r28, so undoubled columns 14.. can be added and doubled later */ \
+ "ror r28 \n\t" \
+ /* column 14 (r8 = A[14]) -> r28,r29,r10 */ \
+ "ld r8, x+ \n\t" \
+ "mov r10, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r28, r0 \n\t" \
+ "adc r29, r1 \n\t" \
+ "adc r10, r25 \n\t" \
+ /* column 15 (r9 = A[15]) -> r29,r10,r11 */ \
+ "ld r9, x+ \n\t" \
+ "mov r11, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r29, r0 \n\t" \
+ "adc r10, r1 \n\t" \
+ "adc r11, r25 \n\t" \
+ /* column 16 (r6 = A[16]) -> r10,r11,r12 */ \
+ "ld r6, x+ \n\t" \
+ "mov r12, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r10, r0 \n\t" \
+ "adc r11, r1 \n\t" \
+ "adc r12, r25 \n\t" \
+ /* column 17 (r7 = A[17]) -> r11,r12,r13 */ \
+ "ld r7, x+ \n\t" \
+ "mov r13, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r11, r0 \n\t" \
+ "adc r12, r1 \n\t" \
+ "adc r13, r25 \n\t" \
+ /* column 18 (r8 = A[18]) -> r12,r13,r14 */ \
+ "ld r8, x+ \n\t" \
+ "mov r14, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r12, r0 \n\t" \
+ "adc r13, r1 \n\t" \
+ "adc r14, r25 \n\t" \
+ /* column 19 (r9 = A[19]) -> r13,r14,r15 */ \
+ "ld r9, x+ \n\t" \
+ "mov r15, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r13, r0 \n\t" \
+ "adc r14, r1 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* column 20 (r6 = A[20]) -> r14,r15,r16 */ \
+ "ld r6, x+ \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r14, r0 \n\t" \
+ "adc r15, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ /* column 21 (r7 = A[21]) -> r15,r16,r17 */ \
+ "ld r7, x+ \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r15, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ /* column 22 (r8 = A[22]) -> r16,r17,r18 */ \
+ "ld r8, x+ \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ /* column 23 (r9 = A[23]) -> r17,r18,r19 */ \
+ "ld r9, x+ \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ /* column 24 (r6 = A[24]) -> r18,r19,r21 */ \
+ "ld r6, x+ \n\t" \
+ "mov r21, r25 \n\t" \
+ "mul r2, r6 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r3, r9 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r8 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r5, r7 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ /* column 25 (r7 = A[25]) -> r19,r21,r22 */ \
+ "ld r7, x+ \n\t" \
+ "mov r22, r25 \n\t" \
+ "mul r2, r7 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r3, r6 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r4, r9 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ "mul r5, r8 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ "adc r22, r25 \n\t" \
+ /* column 26 (r8 = A[26]) -> r21,r22,r23 */ \
+ "ld r8, x+ \n\t" \
+ "mov r23, r25 \n\t" \
+ "mul r2, r8 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r3, r7 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r4, r6 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ "mul r5, r9 \n\t" \
+ "add r21, r0 \n\t" \
+ "adc r22, r1 \n\t" \
+ "adc r23, r25 \n\t" \
+ /* column 27 (r9 = A[27], the last byte read through X) -> r22,r23,r24 */ \
+ "ld r9, x+ \n\t" \
+ "mov r24, r25 \n\t" \
+ "mul r2, r9 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r3, r8 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r4, r7 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ "mul r5, r6 \n\t" \
+ "add r22, r0 \n\t" \
+ "adc r23, r1 \n\t" \
+ "adc r24, r25 \n\t" \
+ /* double columns 14..27 (carries in r23,r24), then add them into the next 14 bytes at Z */ \
+ "lsl r28 \n\t" \
+ "bld r28, 0 \n\t" /* put back the bit saved by bst; bld leaves the carry from lsl untouched */ \
+ "rol r29 \n\t" \
+ "rol r10 \n\t" \
+ "rol r11 \n\t" \
+ "rol r12 \n\t" \
+ "rol r13 \n\t" \
+ "rol r14 \n\t" \
+ "rol r15 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r21 \n\t" \
+ "rol r22 \n\t" \
+ "rol r23 \n\t" \
+ "rol r24 \n\t" \
+ "ld r0, z \n\t" \
+ "add r28, r0 \n\t" \
+ "st z+, r28 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r29, r0 \n\t" \
+ "st z+, r29 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r10, r0 \n\t" \
+ "st z+, r10 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r11, r0 \n\t" \
+ "st z+, r11 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r12, r0 \n\t" \
+ "st z+, r12 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r13, r0 \n\t" \
+ "st z+, r13 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r14, r0 \n\t" \
+ "st z+, r14 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r15, r0 \n\t" \
+ "st z+, r15 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r16, r0 \n\t" \
+ "st z+, r16 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r17, r0 \n\t" \
+ "st z+, r17 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r18, r0 \n\t" \
+ "st z+, r18 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r19, r0 \n\t" \
+ "st z+, r19 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r21, r0 \n\t" \
+ "st z+, r21 \n\t" \
+ "ld r0, z \n\t" \
+ "adc r22, r0 \n\t" \
+ "st z+, r22 \n\t" \
+ "adc r23, r25 \n\t" /* fold the last carry into r24:r23, which carry over into the top 8 bytes */ \
+ "adc r24, r25 \n\t" \
+ /* squares B0*B0, B1*B1, B2*B2, B3*B3 -> r28,r29 / r10,r11 / r12,r13 / r14,r15 (low byte first) */ \
+ "mul r2, r2 \n\t" \
+ "mov r28, r0 \n\t" \
+ "mov r29, r1 \n\t" \
+ "mul r3, r3 \n\t" \
+ "mov r10, r0 \n\t" \
+ "mov r11, r1 \n\t" \
+ "mul r4, r4 \n\t" \
+ "mov r12, r0 \n\t" \
+ "mov r13, r1 \n\t" \
+ "mul r5, r5 \n\t" \
+ "mov r14, r0 \n\t" \
+ "mov r15, r1 \n\t" \
+ "add r28, r23 \n\t" /* add the carry-over bytes r23,r24 to the bottom of the squares */ \
+ "adc r29, r24 \n\t" \
+ "adc r10, r25 \n\t" \
+ "adc r11, r25 \n\t" \
+ /* remaining cross products, undoubled: A[25..27] (r7..r9) with B, and B[i]*B[j], i<j -> r23,r24,r16..r19,r21 */ \
+ "mul r7, r5 \n\t" \
+ "mov r23, r0 \n\t" \
+ "mov r24, r1 \n\t" \
+ "mov r16, r25 \n\t" \
+ "mul r8, r4 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mul r9, r3 \n\t" \
+ "add r23, r0 \n\t" \
+ "adc r24, r1 \n\t" \
+ "adc r16, r25 \n\t" \
+ "mov r17, r25 \n\t" \
+ "mul r8, r5 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r9, r4 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mul r2, r3 \n\t" \
+ "add r24, r0 \n\t" \
+ "adc r16, r1 \n\t" \
+ "adc r17, r25 \n\t" \
+ "mov r18, r25 \n\t" \
+ "mul r9, r5 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mul r2, r4 \n\t" \
+ "add r16, r0 \n\t" \
+ "adc r17, r1 \n\t" \
+ "adc r18, r25 \n\t" \
+ "mov r19, r25 \n\t" \
+ "mul r2, r5 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mul r3, r4 \n\t" \
+ "add r17, r0 \n\t" \
+ "adc r18, r1 \n\t" \
+ "adc r19, r25 \n\t" \
+ "mov r21, r25 \n\t" \
+ "mul r3, r5 \n\t" \
+ "add r18, r0 \n\t" \
+ "adc r19, r1 \n\t" \
+ "adc r21, r25 \n\t" \
+ "mul r4, r5 \n\t" \
+ "add r19, r0 \n\t" \
+ "adc r21, r1 \n\t" \
+ /* double the 7-byte cross-product sum and add it onto the squares */ \
+ "lsl r23 \n\t" \
+ "rol r24 \n\t" \
+ "rol r16 \n\t" \
+ "rol r17 \n\t" \
+ "rol r18 \n\t" \
+ "rol r19 \n\t" \
+ "rol r21 \n\t" \
+ "adc r15, r25 \n\t" /* bit shifted out of r21 lands in the top byte */ \
+ "add r28, r23 \n\t" \
+ "adc r29, r24 \n\t" \
+ "adc r10, r16 \n\t" \
+ "adc r11, r17 \n\t" \
+ "adc r12, r18 \n\t" \
+ "adc r13, r19 \n\t" \
+ "adc r14, r21 \n\t" \
+ "adc r15, r25 \n\t" \
+ /* write the top 8 bytes; these are plain stores, nothing is read back from Z here */ \
+ "st z+, r28 \n\t" \
+ "st z+, r29 \n\t" \
+ "st z+, r10 \n\t" \
+ "st z+, r11 \n\t" \
+ "st z+, r12 \n\t" \
+ "st z+, r13 \n\t" \
+ "st z+, r14 \n\t" \
+ "st z+, r15 \n\t" \
+ "adiw r26, 4 \n\t" /* step X over the four B bytes: X ends 32 past the rewound start */
#define FAST_SQUARE_ASM_32 \
"ldi r25, 0 \n\t" \
@@ -24361,7 +26306,6 @@
"add r23, r0 \n\t" \
"adc r28, r1 \n\t" \
"st z+, r23 \n\t" \
- "st z+, r28 \n\t" \
- "eor r1, r1 \n\t"
+ "st z+, r28 \n\t"
#endif /* _UECC_ASM_AVR_MULT_SQUARE_H_ */