blob: 3b190c9280aeb69bc7a169972b125b8f991b3ceb [file] [log] [blame]
.align 2
.global square256_asm
.type square256_asm, %function
square256_asm:
push {r4-r7,lr}
mov r2, r8
mov r3, r9
mov r4, r10
mov r5, r11
push {r0-r5}
mov r12, r0
mov r4, r1
ldm r4!, {r0-r3}
push {r4}
/////////BEGIN LOW PART //////////////////////
///SQR 128, in r0-r3
mov r8, r2
mov r9, r3
eor r4, r4
sub r2, r0
sbc r3, r1
sbc r4, r4
eor r2, r4
eor r3, r4
sub r2, r4
sbc r3, r4
mov r10, r2
mov r11, r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r7, r7
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r7, r7
add r1, r0
adc r2, r3
adc r7, r3
mov r3, r12
stm r3!, {r0-r1}
push {r3}
mov r12, r0
mov r0, r8
mov r8, r1
mov r1, r9
mov r9, r2
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
eor r4, r4
mov r6, r9
add r0, r6
adc r7, r1
adc r2, r4
adc r3, r4
mov r1, r11
mov r11, r0
mov r0, r10
mov r9, r2
mov r10,r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
mov r6, r11
mov r4, r11
mov r5, r7
sub r6, r0
sbc r7, r1
sbc r4, r2
sbc r5, r3
eor r1, r1
sbc r1, r1
mov r2, r12
mov r3, r8
add r2, r6
adc r3, r7
mov r6, r9
mov r7, r10
adc r4, r6
adc r5, r7
adc r6, r1
adc r7, r1
//results r12, r8, r2-r7
/////////END LOW PART ////////////////////////
pop {r0,r1}
stm r0!, {r2, r3}
push {r0, r4-r7}
ldm r1, {r0-r3}
/////////BEGIN HIGH PART //////////////////////
///SQR 128, in r0-r3
mov r8, r2
mov r9, r3
eor r4, r4
sub r2, r0
sbc r3, r1
sbc r4, r4
eor r2, r4
eor r3, r4
sub r2, r4
sbc r3, r4
mov r10, r2
mov r11, r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r7, r7
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r7, r7
add r1, r0
adc r2, r3
adc r7, r3
mov r12, r0
mov r0, r8
mov r8, r1
mov r1, r9
mov r9, r2
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
eor r4, r4
mov r6, r9
add r0, r6
adc r7, r1
adc r2, r4
adc r3, r4
mov r1, r11
mov r11, r0
mov r0, r10
mov r9, r2
mov r10,r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
mov r6, r11
mov r4, r11
mov r5, r7
sub r6, r0
sbc r7, r1
sbc r4, r2
sbc r5, r3
eor r1, r1
sbc r1, r1
mov r2, r12
mov r3, r8
add r2, r6
adc r3, r7
mov r6, r9
mov r7, r10
adc r4, r6
adc r5, r7
adc r6, r1
adc r7, r1
//results r12, r8, r2-r7
/////////END HIGH PART ////////////////////////
mov r0, r12
mov r1, r8
mov r8, r4
mov r9, r5
mov r10, r6
mov r11, r7
pop {r4}
mov r12, r4//str
pop {r4-r7}
add r0, r4
adc r1, r5
adc r2, r6
adc r3, r7
mov r4, r12
stm r4!, {r0-r3}//low part
mov r4, r8
mov r5, r9
mov r6, r10
mov r7, r11
eor r0, r0
adc r4, r0
adc r5, r0
adc r6, r0
adc r7, r0
pop {r0, r1} //r0->out, r1, in
push {r0,r4-r7}
ldm r1, {r0-r7}
sub r0, r4
sbc r1, r5
sbc r2, r6
sbc r3, r7
sbc r4, r4
eor r0, r4
eor r1, r4
eor r2, r4
eor r3, r4
sub r0, r4
sbc r1, r4
sbc r2, r4
sbc r3, r4
//////////BEGIN MIDDLE PART////////////////
///SQR 128, in r0-r3
mov r8, r2
mov r9, r3
eor r4, r4
sub r2, r0
sbc r3, r1
sbc r4, r4
eor r2, r4
eor r3, r4
sub r2, r4
sbc r3, r4
mov r10, r2
mov r11, r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r7, r7
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r7, r7
add r1, r0
adc r2, r3
adc r7, r3
mov r12, r0
mov r0, r8
mov r8, r1
mov r1, r9
mov r9, r2
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
eor r4, r4
mov r6, r9
add r0, r6
adc r7, r1
adc r2, r4
adc r3, r4
mov r1, r11
mov r11, r0
mov r0, r10
mov r9, r2
mov r10,r3
//SQR64, in: r0, r1, out: r0-r3, used: r0-r6
mov r2, r0
eor r3, r3
sub r2, r1
sbc r3, r3
eor r2, r3
sub r2, r3
lsr r3, r0, #16
uxth r0, r0
mov r4, r0
mul r4, r3
mul r0, r0
mul r3, r3
lsr r5, r4, #16
lsl r4, #16
add r0, r4
adc r3, r5
add r0, r4
adc r3, r5
lsr r4, r1, #16
uxth r1, r1
mov r5, r1
mul r5, r4
mul r1, r1
mul r4, r4
eor r6, r6
add r1, r3
adc r4, r6
lsr r3, r5, #16
lsl r5, r5, #16
add r1, r5
adc r4, r3
add r1, r5
adc r3, r4
lsr r4, r2, #16
uxth r2, r2
mov r5, r2
mul r5, r4
mul r2, r2
mul r4, r4
lsr r6, r5, #16
lsl r5, #16
add r2, r5
adc r4, r6
add r5, r2
adc r6, r4
eor r4, r4
mov r2, r1
sub r1, r5
sbc r2, r6
sbc r4, r4
add r1, r0
adc r2, r3
adc r3, r4
mov r6, r11
mov r4, r11
mov r5, r7
sub r6, r0
sbc r7, r1
sbc r4, r2
sbc r5, r3
eor r1, r1
sbc r1, r1
mov r2, r12
mov r3, r8
add r2, r6
adc r3, r7
mov r6, r9
mov r7, r10
adc r4, r6
adc r5, r7
adc r6, r1
adc r7, r1
//results r12, r8, r2-r7
//////////END MIDDLE PART//////////////////
mvn r2, r2
mvn r3, r3
mvn r4, r4
mvn r5, r5
mvn r6, r6
mvn r7, r7
pop {r1}
push {r4-r7}
mov r4, #1
asr r4, #1
ldm r1!, {r4-r7}
mov r0, r12
mov r12, r1 ////////ref
mov r1, r8
mvn r0, r0
mvn r1, r1
adc r0, r4
adc r1, r5
adc r2, r6
adc r3, r7
eor r4, r4
adc r4, r4
mov r8, r4 //carry A --ini
mov r4, r12
ldm r4, {r4-r7}
add r0, r4
adc r1, r5
adc r2, r6
adc r3, r7
mov r9, r4
mov r4, r12
stm r4!, {r0-r3}
mov r12, r4
mov r4, r9
pop {r0-r3}
adc r4, r0
adc r5, r1
adc r6, r2
adc r7, r3
eor r0, r0
adc r0, r0
mov r9, r0 //carry B --ini
mov r0, r8
asr r0, #1 //carry A --end
pop {r0-r3}
adc r4, r0
adc r5, r1
adc r6, r2
adc r7, r3
mov r8, r0
mov r0, r12
stm r0!, {r4-r7}
mov r11, r0
mov r0, r8
eor r4, r4
mov r5, r9
adc r5, r4 //carry B --end
mvn r6, r4
add r5, r6
adc r6, r4
add r0, r5
adc r1, r6
adc r2, r6
adc r3, r6
mov r7, r11
stm r7!, {r0-r3}
pop {r3-r6}
mov r8, r3
mov r9, r4
mov r10, r5
mov r11, r6
pop {r4-r7,pc}
bx lr
.size square256_asm, .-square256_asm