blob: bb41975aa92da31fa03ecee87b0603413a3c3848 [file] [log] [blame]
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#if !PICO_RP2040
#include "pico/asm_helper.S"
pico_default_asm_setup
.macro double_section name
#if PICO_DOUBLE_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
.macro double_wrapper_section func
double_section WRAPPER_FUNC_NAME(\func)
.endm
.global rtwopi
.global logtab0
.global exptab0
.global exptab1
.global exptab2
.global trigtab
@ load a 32-bit constant n into register rx
.macro movlong rx,n
movw \rx,#(\n)&0xffff
movt \rx,#((\n)>>16)&0xffff
.endm
double_section rtwopi
// 1/2π to plenty of accuracy, 256 bits per line
.long 0 @ this allows values of e down to -32
rtwopi:
.long 0,0
.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410, 0x7F9458EA, 0xF7AEF158
.long 0x6DC91B8E, 0x909374B8, 0x01924BBA, 0x82746487, 0x3F877AC7, 0x2C4A69CF, 0xBA208D7D, 0x4BAED121
.long 0x3A671C09, 0xAD17DF90, 0x4E64758E, 0x60D4CE7D, 0x272117E2, 0xEF7E4A0E, 0xC7FE25FF, 0xF7816603
.long 0xFBCBC462, 0xD6829B47, 0xDB4D9FB3, 0xC9F2C26D, 0xD3D18FD9, 0xA797FA8B, 0x5D49EEB1, 0xFAF97C5E
.long 0xCF41CE7D, 0xE294A4BA, 0x9AFED7EC, 0x47E35742, 0x1580CC11
double_section drrcore
@ input:
@ r0:r1 mantissa m Q52
@ r2 exponent e>=-32, typically offset by +12
@ r3 pointer to rtwopi table
@ output:
@ r0..r2 preserved
@ r7:r8 range reduced result
@ r3,r4,r5,r6,r9,r10 trashed
.thumb_func
drr_core:
ldr r3,=rtwopi
and r5,r2,#31 @ s=e%32
mov r7,#1
lsls r7,r7,r5 @ 1<<s
asrs r8,r2,#5 @ k=e/32, k<=32 for double with e offset <32; with e offsets up to 12+64, k<=34
umull r4,r5,r7,r0
movs r6,#0
umlal r5,r6,r7,r1
@ r2 e
@ r4:r5:r6 u0:u1:u2 = m<<(e%32); u2 is never more than 2<<20
@ r8 e/32
add r3,r3,r8,lsl#2 @ p
ldr r9,[r3,#16] @ a0=p[4]
umull r10,r7,r9,r6 @ r0=a0*u2 hi, discard lo
@ r7 r0
ldr r9,[r3,#12] @ a1=p[3]
umull r10,r8,r9,r5 @ r1=a1*u1 hi, discard lo
umaal r7,r8,r9,r6 @ r0:r1=r0+r1+a1*u2
@ r7:r8 r0:r1
ldr r9,[r3,#8] @ a2=p[2]
mla r8,r9,r6,r8 @ r1+=a2*u2
umlal r7,r8,r9,r5 @ r0:r1+=a2*u1
umull r10,r6,r9,r4 @ r0x=a2*u0 hi, discard lo; u2 no longer needed
@ r7:r8 r0:r1
@ r6 r0x
ldr r9,[r3,#4] @ a3=p[1]
mla r8,r9,r5,r8 @ r1+=a3*u1
umlal r7,r8,r9,r4 @ r0:r1+=a3*u0
@ r7:r8 r0:r1
@ r6 r0x
ldr r9,[r3,#0] @ a4=p[0]
mla r8,r9,r4,r8 @ r1+=a4*u0
@ r7:r8 r0:r1
@ r6 r0x
adds r7,r6
adc r8, r8, #0
bx r14
double_wrapper_section tan
wrapper_func tan
push {r14}
bl sincos_raw
bl __aeabi_ddiv
pop {r15}
double_wrapper_section sin_cos
10: @ process Inf/NaN for dsin and dcos
orrs r2,r0,r1,lsl#12
it eq
orreq r1,#0x80000000
orr r1,#0x00080000
movs r3,r1 @ copy results to cosine output
movs r2,r0
bx r14
@ case where angle is very small (<2^-32) before reduction
32:
cbnz r3,33f
movs r0,#0 @ flush denormal
movs r1,#0
33:
movs r2,#0 @ return x for sine, 1 for cosine
movlong r3,0x3ff00000
tst r12,#2
itt ne
movne r0,r2 @ calculating cosine? move to result registers
movne r1,r3
bx r14
@ case where angle is fairly small after reduction
30:
movs r2,#0
movs r3,#0
movs r4,#0
movs r5,#0x80000000
b 31f
40:
@ case where range-reduced angle is very small
@ here
@ r0:r1 mantissa
@ r2 exponent+12
@ r7:r8 abs range-reduced angle < 2^-10 Q64
@ r12b31: dsincos flag
@ r12b30: original argument sign
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
push {r12}
movs r12,#0
2:
add r12,#4
add r2,#4
bl drr_core @ repeat range reduction with extra factor of 2^4 (, 2^8, 2^12, 2^16,...)
eors r9,r8,r8,asr#31 @ we want to keep the sign in r8b31 for later
cmp r9,#1<<24 @ ≥ 2^-8?
bhs 1f @ loop until the result is big enough
cmp r12,#64 @ safety net
bne 2b
1:
eors r7,r7,r8,asr#31
@ here r7:r9 is the abs range-reduced angle Q64+r12, 2^-8..2^-4 in Q64 terms
@ 2π=6.487ED511 0B4611A6 (2633...)
movlong r6,0x0B4611A6 @ 2π Q64 low fractional word
umull r10,r0,r9,r6
movlong r6,0x487ED511 @ 2π Q64 high fractional word
umull r10,r1,r7,r6
umaal r0,r1,r9,r6
movs r6,#6 @ 2π integer part
umlal r0,r1,r7,r6
mla r1,r9,r6,r1
@ here
@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
@ r8b31: sign of reduced angle
@ r12: excess exponent ≥4, multiple of 4
@ r0:r1 / 2^r12 < ~ 2π * 2^-10 so for sin we need to go to term in x^5
rsbs r10,r12,#32
bmi 1f @ very small result?
lsr r3,r1,r12
lsr r2,r0,r12
lsl r9,r1,r10
orr r2,r9
@ r2:r3 θ Q64 (with r12 bits of loss of precision)
umull r9,r4,r2,r3
umull r9,r5,r2,r3
umaal r4,r5,r3,r3
@ r4:r5 θ² Q64 = θ²/2 Q65 < ~ 4π² * 2^-20
umull r9,r6,r4,r5
umull r9,r7,r4,r5
umaal r6,r7,r5,r5
@ r6:r7 θ⁴ Q64 < ~ 16π⁴ * 2^-40 < 2^-29
lsrs r6,#3
orrs r6,r6,r7,lsl#29
@ r6 θ⁴ Q61
mov r9,#0xaaaaaaaa @ 2/3 Q32
umull r6,r7,r6,r9
@ r7 θ⁴ * 2/3 Q61 = θ⁴/24 Q65
rsbs r6,r4,r7
sbcs r7,r5,r5,lsl#1
@ r6:r7 - θ²/2 + θ⁴/24 Q65, <=0
asrs r9,r7,#12
lsrs r6,#12
adcs r6,r6,r7,lsl#20
adcs r7,r9,#0x0ff00000
adds r7,#0x30000000
push {r6,r7} @ packed cos θ
@ here
@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
@ r4:r5 θ² Q64 = θ²/2 Q65
umull r9,r2,r0,r5
umull r9,r3,r1,r4
umaal r2,r3,r1,r5
@ r2:r3 θ³ Q64+r12
umull r9,r6,r2,r5
umull r9,r7,r3,r4
umaal r6,r7,r3,r5
@ r6:r7 θ⁵ Q64+r12; in fact r7 is always 0
mov r9,#0xaaaaaaaa @ 2/3 Q32
umull r10,r4,r2,r9
movs r5,#0
umlal r4,r5,r3,r9
adds r4,r4,r5
adcs r5,r5,#0
@ r4:r5 θ³*2/3 Q64+r12 = θ³/6 Q66+r12
mov r9,#0x88888888 @ 8/15 Q32
umull r2,r3,r6,r9
@ r3 θ⁵*8/15 Q64+r12 = θ⁵/120 Q70+r12
subs r4,r4,r3,lsr#4
sbc r5,r5,#0
@ r4:r5 θ³/6-θ⁵/120 Q66+r12
subs r0,r0,r4,lsr#2
sbc r1,r1,r5,asr#2
subs r0,r0,r5,lsl#30
sbc r1,r1,#0
@ r0:r1 θ-θ³/6+θ⁵/120 Q64+r12
rsb r4,r12,#0x400
sub r4,#14
bl dpack_q
pop {r6,r7,r12} @ get cosine, flags
b 2f
1:
@ case where range reduction result has excess exponent ≥ 32
@ so sin x=x, cos x=1
rsb r4,r12,#0x400
sub r4,#14
bl dpack_q
pop {r12} @ get flags
movs r6,#0
movlong r7,0x3ff00000
2:
@ here r0:r1 is packed sine, r6:r7 is packed cosine
lsrs r8,#31
bfi r12,r8,#0,#1
asrs r9,r12,#1
bmi 23f @ doing dsincos?
asrs r9,#1
bcc 21f @ need sine?
@ need cosine:
ands r12,#4
orrs r1,r7,r12,lsl#29 @ insert sign
movs r0,r6
pop {r4-r10,r15}
21:
eors r12,r12,r12,lsr#2
orrs r1,r1,r12,lsl#31 @ insert sign
pop {r4-r10,r15}
23:
ands r4,r12,#4
eors r7,r7,r4,lsl#29 @ insert sign
push {r6,r7}
b 20f
@ sincos à la GNU with pointers to where to put results
wrapper_func sincos
push {r2-r5, lr}
bl sincos_raw
pop {r4-r5}
stmia r4!, {r0, r1}
stmia r5!, {r2, r3}
pop {r4, r5, pc}
@ sincos with results in r0:r1 and r2:r3
.thumb_func
sincos_raw:
ands r12,r1,#1<<31
lsrs r12,#1 @ save argument sign in r12b30
orrs r12,r12,#1<<31 @ flag we want both results in r12b31
b 1f
wrapper_func sin
lsrs r12,r1,#29 @ negative argument -> +2 quadrants
ands r12,#4
b 1f
wrapper_func cos
movs r12,#2 @ cos -> +1 quadrant
1:
ubfx r3,r1,#20,#11 @ get exponent
sub r2,r3,#0x3ff
cmp r2,#0x400
beq 10b @ Inf or NaN?
cmn r2,#32
blt 32b @ very small argument?
movs r3,#1
bfi r1,r3,#20,#12 @ fix implied 1 in mantissa
push {r4-r10,r14}
add r2,#12 @ e+12
bl drr_core
@ r7:r8 θ/2π 0..1 Q64
lsrs r4,r8,#30 @ quadrant count
adc r4,r4,#0 @ rounded
sub r8,r8,r4,lsl#30 @ now -0.125≤r7:r8<+0.125 Q64
add r12,r12,r4,lsl#1
orr r12,r12,r8,lsr#31 @ sign into r12b0
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
eors r7,r7,r8,asr#31
eors r8,r8,r8,asr#31 @ absolute value of reduced angle 0≤r7:r8<0.125 Q64
cmp r8,#1<<22 @ < 2^-10?
blo 40b
@ 2π=6.487ED511 0B4611A6 (2633...)
movlong r9,0x0B4611A6 @ 2π Q64 low fractional word
umull r10,r0,r8,r9
movlong r9,0x487ED511 @ 2π Q64 high fractional word
umull r10,r1,r7,r9
umaal r0,r1,r8,r9
movs r9,#6 @ 2π integer part
umlal r0,r1,r7,r9
mla r1,r8,r9,r1
@ r0:r1 range reduced angle θ 0..π/4 Q64
cmp r1,#1<<25 @ θ < 2^-7?
blo 30b
lsrs r2,r1,#27
ldr r3,=trigtab
add r3,r3,r2,lsl#4
ldmia r3,{r2-r5}
31:
subs r0,r0,r2
sbcs r1,r1,r3 @ ε=Θ-φ Q64
bmi 2f @ ε negative?
asrs r6,r12,#1
bmi 5f @ doing dsincos?
asrs r6,#1
bcs 3f @ need cosine?
@ here ε is positive, we need the sin of θ and the sign of the result is r12b0^r12b2
bl dsc_h0
bl dsc_h0s
bl dpack_q63
21:
eors r12,r12,r12,lsr#2
orrs r1,r1,r12,lsl#31 @ insert sign
pop {r4-r10,r15}
2:
asrs r6,r12,#1
bmi 6f @ doing dsincos?
asrs r6,#1
bcs 4f @ need cosine?
@ here ε is negative, we need the sin of θ and the sign of the result is r12b0^r12b2
bl dsc_h1
bl dsc_h1s
bl dnegpack_q63
eors r12,r12,r12,lsr#2
orrs r1,r1,r12,lsl#31 @ insert sign
pop {r4-r10,r15}
@ here ε is positive, we need the cos of θ and the sign of the result is r12b2
3:
bl dsc_h0
bl dsc_h0c
bl dnegpack_q63
22:
ands r12,#4
orrs r1,r1,r12,lsl#29 @ insert sign
pop {r4-r10,r15}
@ here ε is negative, we need the cos of θ and the sign of the result is r12b2
4:
bl dsc_h1
bl dsc_h1c
15:
bl dpack_q63
ands r12,#4
orrs r1,r1,r12,lsl#29 @ insert sign
pop {r4-r10,r15}
5:
@ dsincos, ε positive
bl dsc_h0
push {r2-r7}
bl dsc_h0c
bl dnegpack_q63
ands r4,r12,#4
eors r1,r1,r4,lsl#29 @ negate cosine in quadrants 2 and 3
pop {r2-r7}
push {r0,r1}
bl dsc_h0s
bl dpack_q63
20:
eors r4,r12,r12,lsr#1
eors r4,r4,r12,lsr#2
eors r1,r1,r4,lsl#31 @ negate sine on b0^b1^b2
tst r12,#2 @ exchange sine and cosine in odd quadrants
ittte ne
movne r2,r0
movne r3,r1
popne {r0,r1}
popeq {r2,r3}
ands r4,r12,#1<<30
eors r1,r1,r4,lsl#1 @ negate sine result if argument was negative
pop {r4-r10,r15}
6:
@ dsincos, ε negative
bl dsc_h1
push {r2-r7}
bl dsc_h1c
bl dpack_q63
ands r4,r12,#4
eors r1,r1,r4,lsl#29 @ negate cosine in quadrants 2 and 3
pop {r2-r7}
push {r0,r1}
bl dsc_h1s
bl dnegpack_q63
b 20b
@ sin/cos power series for negative ε
dsc_h1:
rsbs r0,r0,#0
sbc r1,r1,r1,lsl#1
@ drop into positive ε code
@ sin/cos power series for positive ε
dsc_h0:
@ r0:r1 ε Q64
@ r4: sin φ Q32
@ r5: cos φ Q32
umull r6,r7,r1,r1
umull r2,r3,r0,r1
lsrs r7,#1
rrx r6,r6
adds r2,r6,r3
adc r3,r7,#0
@ r2:r3 ε²/2 Q64
umull r7,r6,r3,r0
umlal r7,r6,r2,r1
movs r7,#0
umlal r6,r7,r3,r1
@ r6:r7 ε³/2 Q64
mov r8,#0x55555555
umull r9,r6,r6,r8
mov r9,#0
umlal r6,r9,r7,r8
adds r6,r6,r9
adcs r7,r9,#0
@ r6:r7 ε³/6 Q64
subs r6,r0,r6
sbcs r7,r1,r7
@ r6:r7 ε-ε³/6 Q64
mov r0,r3,lsl#12 @ ε²/2 Q44
orr r0,r0,r2,lsr#20
umull r0,r9,r0,r0 @ r9: ε⁴/4 Q88-32=Q56 = ε⁴/32 Q59
umlal r0,r9,r9,r8 @ r9: ε⁴/24 Q59
umull r0,r8,r9,r1 @ ε⁵/24 Q59
mov r0,#0x33333333
umull r0,r8,r8,r0 @ r8: ε⁵/120 Q59
@ r6:r7 ε-ε³/6+ε⁵/120 Q64
smmulr r10,r8,r1 @ r10: ε⁶/120 Q59
movlong r0,0x2aaaaaaa
smmlsr r9,r0,r10,r9 @ ε⁴/24-ε⁶/720
subs r2,r2,r9,lsl#5
sbcs r3,r3,r9,lsr#27
@ r2:r3 ε²/2-ε⁴/24+ε⁶/720 Q64
smmulr r10,r10,r1 @ r10: ε⁷/120 Q59
mov r0,#0x6180000 @ 1/42 Q64
smmlsr r8,r10,r0,r8 @ r8: ε⁵/120-ε⁷/5040 Q59
adds r6,r6,r8,lsl#5
adcs r7,r7,r8,lsr#27
bx r14
dsc_h0s:
@ postprocess for sine, positive ε
@ r2:r3 1-cos |ε| Q64
@ r4: sin φ Q31
@ r5: cos φ Q31
@ r6:r7 sin |ε| Q64
umull r8,r0,r2,r4
mov r1,#0
umlal r0,r1,r3,r4
@ r0:r1 sin φ(1-cos ε)
umull r8,r3,r6,r5
umlal r3,r4,r7,r5
@ r3:r4 sin φ+cos φ.sin ε
subs r0,r3,r0
sbc r1,r4,r1
@ r0:r1 sin φ+cos φ.sin ε - sin φ(1-cos ε) = sin φ.cos ε + cos φ.sin ε = sin(φ+ε)
bx r14
dsc_h1s:
@ postprocess for sine, negative ε
@ r2:r3 1-cos |ε| Q64
@ r4: sin φ Q31
@ r5: cos φ Q31
@ r6:r7 sin |ε| Q64
umull r8,r0,r2,r4
mov r1,#0
umlal r0,r1,r3,r4
@ r0:r1 sin φ(1-cos ε)
umull r8,r3,r6,r5
rsbs r4,r4,#0
umlal r3,r4,r7,r5
@ r3:r4 -sin φ-cos φ.sin ε
adds r0,r3,r0
adc r1,r4,r1
@ r0:r1 -sin φ-cos φ.sin ε + sin φ(1-cos ε) = -sin φ.cos ε - cos φ.sin ε = -sin(φ+ε)
bx r14
dsc_h0c:
@ postprocess for cosine, positive ε
@ r2:r3 1-cos |ε| Q64
@ r4: sin φ Q32
@ r5: cos φ Q32
@ r6:r7 sin |ε| Q64
umull r8,r0,r2,r5
mov r1,#0
umlal r0,r1,r3,r5
@ r0:r1 cos φ(1-cos ε)
umull r8,r3,r6,r4
rsbs r5,#0
umlal r3,r5,r7,r4
@ r3:r5 -cos φ+sin φ.sin ε
adds r0,r3,r0
adc r1,r5,r1
@ r0:r1 -cos φ+sin φ.sin ε + cos φ(1-cos ε) = - cos φ.cos ε + sin φ.sin ε = -cos(φ+ε)
bx r14
dsc_h1c:
@ postprocess for cosine, negative ε
@ r2:r3 1-cos |ε| Q64
@ r4: sin φ Q32
@ r5: cos φ Q32
@ r6:r7 sin |ε| Q64
umull r8,r0,r2,r5
mov r1,#0
umlal r0,r1,r3,r5
@ r0:r1 cos φ(1-cos ε)
umull r8,r3,r6,r4
umlal r3,r5,r7,r4
@ r3:r5 cos φ-sin φ.sin ε
subs r0,r3,r0
sbc r1,r5,r1
@ r0:r1 cos φ-sin φ.sin ε - cos φ(1-cos ε) = cos φ.cos ε - sin φ.sin ε = cos(φ+ε)
bx r14
double_section dpack
@ dnegpack: negate and pack
@ dpack_q63:
@ input
@ r0:r1 Q63 result, must not be zero
@ r4 exponent offset [dpack_q only]
@ output
@ r0:r1 IEEE double
@ trashes r2,r3,r4
.thumb_func
dnegpack_q63:
rsbs r0,r0,#0
sbc r1,r1,r1,lsl#1
.thumb_func
dpack_q63:
mov r4,#0x3ff-12 @ exponent
.thumb_func
dpack_q:
clz r2,r1
cmp r2,#12
bhs 1f
adds r2,#21
lsls r3,r1,r2
rsb r2,#32
lsrs r0,r0,r2 @ save rounding bit in carry
lsr r1,r1,r2
add r2,r2,r4
adcs r0,r0,r3 @ with rounding
adc r1,r1,r2,lsl#20 @ insert exponent
bx r14
1:
cbz r1,2f
rsb r2,#43
lsrs r3,r0,r2
rsb r2,#32
lsls r1,r1,r2
lsls r0,r0,r2
orrs r1,r3
sub r2,r4,r2
add r1,r1,r2,lsl#20
bx r14
2:
movs r1,r0
mov r0,#0
sub r4,#32
bne dpack_q
bx r14
double_section dreduce
@ input:
@ r0:r1 x, double (only mantissa bits used)
@ r2 quotient offset
@ r3 exponent e of x with 0x3ff bias removed, -32≤e<12 so 2^-32≤x<2^12
@ r4:r5 r Q64, 0.5≤r<1
@ r6 1/r underestimate Q31
@ output:
@ r0:r1 x mod r Q64 [possibly slightly > r?]
@ r2 quotient+offset
@ r4:r5 r preserved
@ trashes r7,r8
@ increases r2 by up to 2^13
@ this version only used by dexp
.thumb_func
dreduce:
movs r7,#1
bfi r1,r7,#20,#12 @ insert implied 1, clear exponent and sign
lsls r8,r7,r3
beq 1f @ e<0, x<1
umull r0,r7,r0,r8
mla r1,r1,r8,r7
@ r0:r1 x Q52
umull r7,r8,r1,r6 @ Q83-32=Q51
lsrs r6,r8,#19 @ q Q0
adds r2,r2,r6
umull r7,r8,r6,r4
mla r8,r6,r5,r8 @ r7:r8 q*r Q64
lsls r1,#12
orrs r1,r1,r0,lsr#20
rsbs r0,r7,r0,lsl#12
sbc r1,r1,r8 @ x-qr Q64
@ check we never return slightly more than r
cmp r1,r5 @ quick comparison
it lo
bxlo r14
b 2f
1:
adds r3,#12
movs r7,#1
lsls r8,r7,r3
beq 1f @ e<-12
umull r0,r7,r0,r8
mla r1,r1,r8,r7 @ x Q64
cmp r1,r5 @ quick comparison
it lo
bxlo r14
2:
it eq
cmpeq r0,r4
it lo
bxlo r14
subs r0,r0,r4 @ subtract one r
sbc r1,r1,r5
adds r2,#1
bx r14
1:
@ here e<-12, have to shift r0:r1 down by -r3 places
add r3,#32
lsls r6,r1,r3
rsbs r3,#32
lsrs r0,r0,r3
lsrs r1,r1,r3
orrs r0,r0,r6
bx r14
double_section exptab
.align 2
exptab0:
.quad 0x0000000000000000,0x0f85186008b15304,0x1e27076e2af2e5c8,0x2f57120421b2120d
.quad 0x3f7230dabc7c5512,0x4e993155a517a717,0x5fabe0ee0abf0d9d,0x6fad36769c6defe3
.quad 0x7ebd623de3cc7b69,0x8f42faf3820681f0,0x9ec813538ab7d537,0xaf70154920b3ab7f
exptab1:
.quad 0x0000000000000000,0x00ff805515885e02,0x01fe02a6b1067890,0x02fb88ebf0214edc
.quad 0x03f815161f807c7a,0x04f3a910d1a95d3c,0x05ee46c1f56c46aa,0x06e7f009ebe465ff
.quad 0x07e0a6c39e0cc013,0x08d86cc491ecbfe1,0x09cf43dcff5eafd5,0x0ac52dd7e4726a46
.quad 0x0bba2c7b196e7e23,0x0cae41876471f5bf,0x0da16eb88cb8df61,0x0e93b5c56d85a909
.quad 0x0f85186008b15331,0x1075983598e47130
exptab2:
.quad 0x000fff8005551559,0x002ffb808febc309,0x004ff3829a0e91b1,0x006fe78722fde71f
.quad 0x008fd78f299aa0c3,0x00afc39bac66434f,0x00cfabada9832a41,0x00ef8fc61eb4b74f
.quad 0x010f6fe6095f81b6,0x012f4c0e66898567,0x014f244032da521a,0x016ef87c6a9b3a48
.quad 0x018ec8c409b781ff,0x01ae95180bbc8d9c,0x01ce5d796bda1070,0x01ee21e924e23b3a
logtab0:
.quad 0xa0ec7f4233957338,0x918986bdf5fa1431,0x8391f2e0e6fa026b,0x7751a813071282e6
.quad 0x6a73b26a68212621,0x5fabe0ee0abf0d9d,0x546ab61cb7e0b419,0x48a507ef3de59695
.quad 0x3c4e0edc55e5cbd1,0x32a4b539e8ad68ce,0x289a56d996fa3ccb,0x21aefcf9a11cb2c9
.quad 0x16f0d28ae56b4b86,0x0f85186008b15304,0x07e0a6c39e0cc002,0x0000000000000000
.align 2
exprrdata:
.quad 0xB17217F7D1CF79AC @ ln2 Q64
.long 0xB8AA3B29 @ 1/ln2 Q31, rounded down
double_wrapper_section exp
2:
@ could use dadd macro to calculate x+1 here
lsl r0,r1,#11
orr r0,#0x80000000
lsls r1,#1
adc r3,r3,#32
movlong r1,0x3ff00000
rsb r3,#11
lsr r0,r3
it cc
bxcc r14
rsbs r0,#0
sbc r1,r1,#0
bx r14
wrapper_func exp
movs r12,r1,lsr#31 @ save sign
ubfx r3,r1,#20,#11 @ get exponent
sub r3,r3,#0x3ff
cmp r3,#12
bge 20f @ overflow, Inf or NaN?
cmn r3,#0x20
ble 2b @ <2^-32? return x+1
push {r4-r8,r14}
ldr r4,=exprrdata
ldmia r4,{r4-r6}
mov r2,#0
bl dreduce
tst r12,#1
beq 1f
mvn r2,r2 @ quotient is now signed
subs r0,r4,r0
sbc r1,r5,r1
1:
add r12,r2,#0x3fe @ exponent offset
mov r3,#0x7fe
cmp r12,r3
bhs 1f @ under/overflow
lsrs r2,r1,#28
ldr r3,=exptab0
add r3,r3,r2,lsl#3
ldmia r3,{r2-r3}
and r5,r2,#63
orr r5,#64 @ y=(t&0x3f)+0x40; Q6
subs r0,r2
sbcs r1,r3
lsrs r2,r1,#24
ldr r3,=exptab1
add r3,r3,r2,lsl#3
ldmia r3,{r3-r4}
add r2,#256
muls r5,r5,r2 @ y Q14
subs r0,r3
sbcs r1,r4
lsrs r2,r1,#21
ldr r3,=exptab2
add r3,r3,r2,lsl#3
ldmia r3,{r3-r4}
add r2,r2,r2
add r2,#4096
mla r5,r5,r2,r5 @ y Q26
subs r0,r3
sbcs r1,r4
movs r2,r1,lsl#10
orrs r2,r2,r0,lsr#22
adc r2,r2,#0 @ ε Q42, rounded
smull r3,r4,r2,r2 @ ε² Q84-32=Q52
lsrs r3,#21
orrs r3,r3,r4,lsl#11
adds r0,r0,r3
adc r1,r1,r4,lsr#21 @ ε+ε²/2 Q64
smull r3,r4,r4,r2 @ Q52*Q42=Q94; Q94-32=Q62
mov r3,#0x55555555 @ 1/6 Q33
smull r3,r4,r3,r4 @ ε³/6 Q63
smmulr r3,r4,r1 @ ε⁴/6+ε⁵/12 Q63+Q32-32=Q63
add r4,r4,r3,lsr#2
adds r2,r0,r4,lsl#1
adc r3,r1,r4,asr#31
lsls r1,r5,#3 @ y Q29
umull r4,r0,r1,r2 @ εlo * y Q61+32
smlal r0,r1,r1,r3 @ εhi * y + y Q61
@ assert result is in range 1..2
lsrs r0,#9
adcs r0,r0,r1,lsl#23
lsr r1,#9
adcs r1,r1,r12,lsl#20
pop {r4-r8,r15}
20:
@ process Inf/NaN for dexp
cmp r3,#0x400
bne 22f
orrs r2,r0,r1,lsl#12
ite eq
biceq r1,r1,r1,asr#31 @ +Inf -> + Inf; -Inf -> +0
orrne r1,r1,#0x00080000
bx r14
22:
movs r0,#0
movs r1,#0
tst r12,#1
it eq
movteq r1,0x7ff0
bx r14
1: @ under/overflow
mov r0,#0
mov r1,#0
it ge
movtge r1,#0x7ff0
pop {r4-r8,r15}
double_wrapper_section log
1:
movlong r1,0xfff00000 @ return -Inf
movs r0,#0
bx r14
4:
orrs r2,r0,r1,lsl#12
it ne
orrne r1,#0x00080000
bx r14
10:
mvns r5,r6,asr#22 @ very small argument?
bne 10f
mov r4,#4096
b 11f
@ check for argument near 1: here
@ r1 : mantissa
@ r12: exponent, -1 or 0
12:
eor r3,r12,r1,lsr#12
lsls r3,r3,#24 @ check 8 bits of mantissa
bne 12f @ not very close to 1
cmp r12,#0
bne 13f
@ argument is 1+ε, result will be positive
lsls r1,#19
orrs r1,r1,r0,lsr#13
lsls r0,#19
@ r0:r1 ε Q71 0≤ε<2^-8
clz r4,r1 @ r4≥1
cmp r4,#32
bhs 14f
movs r5,#1
lsls r5,r4
umull r2,r3,r0,r5
mla r3,r1,r5,r3 @ r2:r3 ε Q71+r4
umull r12,r5,r0,r3
umull r12,r6,r1,r2
umaal r5,r6,r1,r3 @ r5:r6 ε² Q142+r4-64 = Q78+r4
subs r2,r2,r5,lsr#8
sbc r3,r3,#0
subs r2,r2,r6,lsl#24
sbcs r3,r3,r6,lsr#8
umull r12,r7,r0,r6
umull r12,r8,r1,r5
umaal r7,r8,r1,r6 @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=1 and Q86, so r8<0x40000000
mov r5,#0x55555555 @ ~1/3 Q32
umull r12,r6,r7,r5
movs r12,#0
umlal r6,r12,r8,r5
adds r6,r6,r12
adc r12,r12,#0 @ multiply by 0x5555555555555555
adds r2,r2,r6,lsr#14
adc r3,r3,#0
adds r2,r2,r12,lsl#18
adc r3,r3,r12,lsr#14
smmulr r5,r8,r1 @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
movs r7,#0x33333333 @ 1/5 Q32
smmulr r6,r5,r1 @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
smmulr r8,r6,r7 @ ε⁵/5 Q67+r4 ~ 2^-42
sub r7,r5,r8,lsr#5
smmulr r5,r6,r1 @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
movt r6,#0x2a80 @ 1/6 Q32 fiddled slightly (PMC)
smmulr r5,r6,r5 @ ε⁶/6 Q75+r4 ~ 2^-50
add r7,r7,r5,lsr#12
subs r0,r2,r7,lsl#9
sbc r1,r3,r7,lsr#23
rsb r4,#0x400
sub r4,#0x15
bl dpack_q
pop {r4-r8,r15}
@ here we have positive ε sufficiently small we need (at most) a quadratic term
14:
@ here r0=ε Q71, 0≤ε<2^-40
clz r4,r0
lsls r1,r0,r4 @ ε Q71+r4
umull r2,r3,r0,r1 @ ε² Q142+r4
mov r0,#0
subs r0,r0,r3,lsr#8
sbc r1,r1,#0
rsb r4,#0x400
sub r4,#0x35
bl dpack_q
pop {r4-r8,r15}
13: @ argument is 1-ε, result will be negative
movs r1,r1,lsl#18
orrs r1,r1,r0,lsr#14
movs r0,r0,lsl#18
rsbs r0,#0
sbc r1,r1,r1,lsl#1
@ r0:r1 -ε Q71 -2^-9≤ε<0
clz r4,r1
cmp r4,#32
bhs 15f
subs r4,#1 @ 0≤r4<31
movs r5,#1
lsls r5,r4
umull r2,r3,r0,r5
mla r3,r1,r5,r3 @ r2:r3 ε Q71+r4
umull r12,r5,r0,r3
umull r12,r6,r1,r2
umaal r5,r6,r1,r3 @ r5:r6 ε² Q142+r4-64 = Q78+r4
adds r2,r2,r5,lsr#8
adc r3,r3,#0
adds r2,r2,r6,lsl#24
adcs r3,r3,r6,lsr#8
umull r12,r7,r0,r6
umull r12,r8,r1,r5
umaal r7,r8,r1,r6 @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=0 and Q85, so r8<0x20000000
mov r5,#0x55555555 @ ~1/3 Q32
umull r12,r6,r7,r5
movs r12,#0
umlal r6,r12,r8,r5
adds r6,r6,r12
adc r12,r12,#0 @ multiply by 0x5555555555555555
adds r2,r2,r6,lsr#14
adc r3,r3,#0
adds r2,r2,r12,lsl#18
adc r3,r3,r12,lsr#14
smmulr r5,r8,r1 @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
movs r7,#0x33333333 @ 1/5 Q32
smmulr r6,r5,r1 @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
smmulr r8,r6,r7 @ ε⁵/5 Q67+r4 ~ 2^-42
add r7,r5,r8,lsr#5
smmulr r5,r6,r1 @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
movt r6,#0x2a80 @ 1/6 Q32 fiddled slightly (PMC)
smmulr r5,r6,r5 @ ε⁶/6 Q75+r4 ~ 2^-50
add r7,r7,r5,lsr#12
adds r0,r2,r7,lsl#9
adc r1,r3,r7,lsr#23
rsb r4,#0x400
sub r4,#0x15
bl dpack_q
orr r1,r1,#1<<31
pop {r4-r8,r15}
@ here we have negative ε sufficiently small we need (at most) a quadratic term
@ here r0=ε Q71, |ε|<2^-41
15:
clz r4,r0
lsls r1,r0,r4 @ ε Q71+r4
umull r2,r3,r0,r1 @ ε² Q142+r4
mov r0,r3,lsr#8
rsb r4,#0x400
sub r4,#0x35
bl dpack_q
eors r1,r1,#0x80000000
pop {r4-r8,r15}
wrapper_func log
lsls r12,r1,#1
bcs 1b @ x<0?
lsrs r12,#21
beq 1b @ x==0/denormal?
sub r12,#0x3ff
cmp r12,#0x400 @ +Inf/NaN?
beq 4b
movs r2,#1
bfi r1,r2,#20,#12 @ set implied 1, clear exponent Q52
push {r4-r8,r14}
cmp r12,r12,asr#31 @ exponent = -1 or 0?
beq 12b
12:
lsrs r4,r1,#16
ldr r5,=logtab0-16*8
add r5,r5,r4,lsl#3
ldmia r5,{r2-r3}
and r5,r2,#63
add r5,#64
umull r0,r6,r5,r0
mla r1,r5,r1,r6 @ Q59
mvn r4,r1,asr#19
and r4,#31
ldr r5,=exptab1
add r5,r5,r4,lsl#3
ldmia r5,{r5-r6}
adds r2,r2,r5
adc r3,r3,r6
add r4,#256
umull r0,r6,r4,r0
mla r6,r4,r1,r6 @ r0:r6 Q67
mvns r4,r6,asr#24
beq 10b @ small argument at this stage?
10:
ldr r5,=exptab2
add r5,r5,r4,lsl#3
ldmia r5,{r1,r5}
adds r2,r2,r1
adc r3,r3,r5
mov r5,#4097
add r4,r5,r4,lsl#1 @ 4097+2k
11:
lsls r4,#17
umull r5,r0,r4,r0
rsb r1,r4,r4,lsl#3
umlal r0,r1,r4,r6 @ Q96=Q64
@ r0:r1 ε Q64
@ r2:r3 y Q64
eor r4,r0,r1,asr#31
eor r5,r1,r1,asr#31 @ r4:r5 |ε| Q64
umull r6,r7,r5,r5
umull r4,r8,r4,r5
lsrs r7,#1
rrx r6,r6
adds r6,r6,r8
adc r7,r7,#0 @ r6:r7 ε²/2 Q64
movs r4,r1,lsl#10
orrs r4,r4,r0,lsr#22
adc r4,r4,#0 @ ε Q42, rounded
subs r0,r0,r6
sbc r1,r1,r7 @ r0:r1 ε-ε²/2 Q64
smmulr r5,r4,r4 @ ε² Q42+42-32=Q52
smmulr r6,r5,r4 @ ε³ Q52+42-32=Q62
smmulr r7,r5,r5 @ ε⁴ Q52+52-32=Q72 ε⁴/4 Q74
mov r4,#0x55555555 @ 1/3 Q32
smmulr r6,r6,r4 @ ε³/3 Q62
subs r6,r6,r7,lsr#12 @ ε³/3-ε⁴/4 Q62
adds r0,r0,r6,lsl#2 @ Q64
adc r1,r1,r6,asr#30
subs r0,r2,r0
sbc r1,r3,r1
ldr r2,=exprrdata
ldmia r2,{r2,r5}
adds r12,#1
ble 1f
@ positive result
umull r2,r3,r2,r12
movs r4,#0
umlal r3,r4,r5,r12
subs r2,r2,r0
sbcs r3,r3,r1
sbc r4,r4,#0
movs r1,#0x40000000
b 2f @ to pack result
@ negative result
1:
rsbs r12,#0
umull r2,r3,r2,r12
movs r4,#0
umlal r3,r4,r5,r12
adds r2,r0,r2
adcs r3,r1,r3
adc r4,r4,#0
movs r1,#0xc0000000
2:
cbnz r4,2f
movs r4,r3
movs r3,r2
movs r2,#0
subs r1,#32<<20
1:
@ here r3:r4 is guaranteed nonzero
cbnz r4,2f
movs r4,r3
movs r3,#0
subs r1,#32<<20
2:
clz r5,r4
sub r6,r5,#0x1d
sub r1,r1,r6,lsl#20
lsls r4,r5
lsls r0,r3,r5
rsb r5,#32
lsrs r3,r5
orrs r4,r4,r3
lsrs r2,r5
orrs r0,r0,r2
@ now r0:r4 is normalised to Q63
lsrs r0,#11
adcs r0,r0,r4,lsl#21 @ with rounding
adc r1,r1,r4,lsr#11
pop {r4-r8,r15}
@===========================================
double_section trigtab
trigtab:
// φ Q64 lo φ Q64 hi sin φ Q31 cos φ Q31
.long 0x25735c0b, 0x03f574a9, 0x01fab529, 0x7ffc1500
.long 0x00aa2feb, 0x0c44d954, 0x0621d2c8, 0x7fda601f
.long 0x42e86336, 0x13d9d3cf, 0x09ea5e3c, 0x7f9d88f0
.long 0x046dc42f, 0x1c0a86d9, 0x0dfe171f, 0x7f3b9ed0
.long 0xec509ba7, 0x23ebf53e, 0x11e6e7bc, 0x7ebdefc7
.long 0x039c1cd2, 0x2bf112b2, 0x15dcf546, 0x7e1e7749
.long 0x3d7a05ca, 0x33f293f4, 0x19cbc014, 0x7d5fac9c
.long 0x7aefa0a0, 0x3c19321d, 0x1dc6221d, 0x7c7d2f2f
.long 0x12fb0fb5, 0x4450ef70, 0x21c10c53, 0x7b782235
.long 0x476b8d5c, 0x4bf1a045, 0x256adc18, 0x7a68ad16
.long 0x576940c1, 0x53ead96d, 0x29361639, 0x792f2d3c
.long 0xd34eeadf, 0x5bb34289, 0x2ce039d6, 0x77e02625
.long 0x31ea5069, 0x641107d9, 0x30c4d3c0, 0x76586270
.long 0xf36756cb, 0x6bfda2b3, 0x34687e1c, 0x74c77a22
.long 0x2f7aed0e, 0x73e07531, 0x37fae7cc, 0x731c1127
.long 0xfcc48ec0, 0x7c14790e, 0x3ba3a70e, 0x7141cd8e
.long 0x3b04b713, 0x83547b8f, 0x3ed289d4, 0x6f85d8eb
.long 0x135b369a, 0x8c0b6fd9, 0x4294ead4, 0x6d51efa3
.long 0x4aca525f, 0x9439d326, 0x460a6e70, 0x6b230540
.long 0x41c44083, 0x9c0e0b34, 0x4948b03d, 0x68f1ef07
.long 0xa36d08bf, 0xa47961fd, 0x4cb1f285, 0x667a849d
.long 0xc8f81636, 0xabe547d2, 0x4fa2221e, 0x6436622f
.long 0xd0c8c9ad, 0xb3feade1, 0x52c37024, 0x61a4af86
.long 0x64730e73, 0xbbfe356a, 0x55c5f028, 0x5f02a3a3
.long 0xb08ca5c6, 0xc412b955, 0x58ba9208, 0x5c4194a3
.long 0x54530090, 0xcbb02d37, 0x5b6ef419, 0x59938ed8
.long 0xa18d7c36, 0xd3aa3c6e, 0x5e2e0225, 0x56af32fc
.long 0x48a2c7d0, 0xdbff19f6, 0x60f35332, 0x5392ed7e
.long 0x7aad9a94, 0xe422ade1, 0x638edfca, 0x50732bc7
.long 0x19ef15ff, 0xebfb85bd, 0x65fa18bb, 0x4d5c61c7
.long 0x7e0f96bd, 0xf44c4b49, 0x686f81f7, 0x4a0217e6
.long 0x1def09eb, 0xfc4dbdfd, 0x6ab2d2c4, 0x46b4e413
// maximum e=0.002617 = 0.167497*2^-6
double_wrapper_section atan2
@ datan small reduced angle case I
20:
@ r0:r1 has z=y/x in IEEE format, <2^-11
@ r2 e+11
rsbs r12,r2,#0 @ shift down of mantissa required to get to Q63 >0
subs r10,r12,#32
bge 1f @ very small reduced angle?
bfi r1,r3,#20,#12 @ fix up mantissa
cmp r7,#4
bhs 2f @ at least one quadrant to add? then don't need extreme accuracy
@ otherwise we need to do a power series for accuracy
rsbs r10,#0
lsr r3,r1,r12
lsr r2,r0,r12
lsl r9,r1,r10
orr r2,r9
@ r2:r3 z Q63 (with r12 bits of loss of precision)
lsls r1,#11
orrs r1,r1,r0,lsr#21
lsls r0,#11
@ r0:r1 z Q74+r12
umull r9,r4,r2,r3
umull r9,r5,r2,r3
umaal r4,r5,r3,r3
@ r4:r5 z² Q62 < 2^-22
umull r9,r2,r0,r5
umull r9,r3,r1,r4
umaal r2,r3,r1,r5
@ r2:r3 z³ Q72+r12 <2^-33
umull r9,r6,r2,r5
umull r9,r8,r3,r4
umaal r6,r8,r3,r5
@ r6:r8 z⁵ Q70+r12 <2^-55; in fact r8 is always 0
mov r9,#0xaaaaaaaa @ 2/3 Q32
umull r10,r4,r2,r9
movs r5,#0
umlal r4,r5,r3,r9
adds r4,r4,r5
adcs r5,r5,#0
@ r4:r5 z³*2/3 Q72+r12 = z³/3 Q73+r12
mov r9,#0x33333333 @ 1/5 Q32
umull r2,r3,r6,r9
@ r3 z⁵*1/5 Q70+r12
subs r4,r4,r3,lsl#3
sbc r5,r5,#0
@ r4:r5 z³/3-z⁵/5 Q73+r12
subs r0,r0,r4,lsl#1
sbc r1,r1,r5,lsl#1
sub r1,r1,r4,lsr#31
@ r0:r1 z-z³/3+z⁵/5 Q74+r12
rsb r4,r12,#0x400
sub r4,#24
b 60f @ pack and return
2:
rsbs r10,#0
lsls r4,r1,r10
lsrs r0,r0,r12
lsrs r1,r1,r12
orrs r0,r0,r4 @ shift down r12 places
b 50f
1:
cmp r7,#4
bhs 2f @ at least one quadrant to add?
eors r1,r1,r7,lsl#31 @ no: return y/x with the correct sign
pop {r4-r11,r15}
2:
bfi r1,r3,#20,#12 @ fix up mantissa
usat r10,#6,r10 @ saturate r10 to 63
lsrs r0,r1,r10
movs r1,#0 @ shift down 32+r10 places
b 40f
@ datan small reduced angle case II
10:
lsrs r1,#1
rrxs r0,r0
movs r2,#0
movs r3,#0
movs r6,#0
mov r7,#1<<30
b 11f
@ case where reduced (x',y') has x' infinite
71:
sbfx r4,r1,#20,#11
movs r0,#0
movs r1,#0
cmn r4,#1 @ y' also infinite?
bne 80f
movt r1,#0x3ff0 @ both infinite: pretend ∞/∞=1
b 80f
@ case where reduced (x',y') has y' zero
70:
ubfx r4,r3,#20,#11
movs r0,#0
movs r1,#0
cbnz r4,80f @ x' also zero?
tst r7,#4
beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct
tst r7,#2
ite eq
addeq r7,#6
subne r7,#6 @ fix up when in quadrants ±0
b 80f
90:
movs r0,r2
movs r1,r3
91:
orrs r1,r1,#0x00080000
bx r14
wrapper_func atan2
cmp r2,#1 @ set C if low word is non-zero
adc r12,r3,r3
cmn r12,#0x00200000 @ y NaN?
bhi 90b
cmp r0,#1 @ set C if low word is non-zero
adc r12,r1,r1
cmn r12,#0x00200000 @ x NaN?
bhi 91b
push {r4-r11,r14}
lsrs r7,r1,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
bic r1,#1<<31
@ y now positive
movs r3,r3
bpl 1f
@ here y positive, x negative
adds r7,#10
bic r3,r3,#1<<31
cmp r3,r1
bhi 4f @ |x| > y: 3rd octant
@ |x| < y: 2nd octant
subs r7,#6
b 3f
1:
@ here x and y positive
cmp r3,r1
bhi 4f
@ x < y: 1st octant
adds r7,#6
3:
movs r4,r2 @ exchange x and y
movs r5,r3
movs r2,r0
movs r3,r1
movs r0,r4
movs r1,r5
4:
@ here
@ r0:r1 y'
@ r2:r3 x'
@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r7b3..2, the inner negation
@ is given by r7b1 and the outer negation by r7b0. x' can be infinite, or both x' and
@ y' can be infinite, but not y' alone.
sbfx r4,r3,#20,#11
cmn r4,#1
beq 71b @ x' infinite?
ubfx r4,r1,#20,#11
cmp r4,#0
beq 70b @ y' zero?
bl __aeabi_ddiv
80:
@ r0:r1 y/x in IEEE format, 0..1
lsr r2,r1,#20 @ exponent
movs r3,#1
subs r2,#0x3ff-11
bmi 20b
bfi r1,r3,#20,#12 @ fix up mantissa
movs r3,#1
lsl r3,r2
umull r0,r4,r0,r3
mla r1,r1,r3,r4
50:
push {r7} @ save flags
@ from here atan2(y,1) where 1 implied
@ r0:r1 y Q63 0≤y<1+δ
lsrs r2,r1,#16
cmp r2,#0x100
blo 10b @ small angle?
mul r3,r2,r2 @ y^2
movw r4,#0x895c
muls r2,r2,r4 @ y*0x895c
movw r5,#0x1227
lsrs r3,#14
mls r2,r3,r5,r2
subs r2,#0x330000 @ Chebyshev approximation to atn(y)
lsrs r2,#25
ldr r3,=trigtab
add r3,r3,r2,lsl#4
ldmia r3,{r2-r5}
lsrs r3,#1
rrxs r2,r2
@ r2:r3 phi0 Q63
@ r4 sphi0
@ r5 cphi0
umull r12,r6,r4,r0
movs r7,#0
umlal r6,r7,r4,r1
adds r6,r6,r5,lsl#31
adc r7,r7,r5,lsr#1 @ x0= ((i128)cphi0<<31)+(((i128)sphi0*(i128)y)>>32); // Q62
@ r6:r7 x0
umull r12,r0,r5,r0
movs r8,#0
umlal r0,r8,r5,r1
subs r0,r0,r4,lsl#31
sbc r1,r8,r4,lsr#1 @ y0=-((i128)sphi0<<31)+(((i128)cphi0*(i128)y)>>32); // Q62
11:
@ r0:r1 y0
@ r2:r3 phi0
@ r6:r7 x0
lsls r4,r1,#6
orr r4,r4,r0,lsr#26
lsrs r5,r7,#15
sdiv r4,r4,r5 @ t2=(y0>>26)/(x0>>47); // Q62-26/Q62-47=Q21
mul r5,r4,r4 @ t2_2 Q42
add r3,r3,r4,lsl#10 @ phi0+t2
smull r8,r9,r4,r5 @ t2_3 Q63
mov r10,r9,lsl#16
orr r10,r10,r8,lsr#16
smmulr r10,r10,r5 @ t2_5 Q57
mov r12,#0x66666666 @ 1/5 Q33
smmulr r11,r10,r5 @ t2_7 Q67
smmulr r10,r10,r12 @ t2_5/5 Q57+33-32=Q58
movlong r12,0x124925 @ 1/7 Q23
smmulr r11,r11,r12 @ t2_7/7 Q67+23-32=Q58
mov r12,#0x55555555
sub r11,r11,r11,asr#12 @ Q58 PMC correction
sub r10,r10,r11
adds r2,r2,r10,lsl#5
adc r3,r3,r10,asr#27 @ Q63 phi0 + t_2 + t2_5/5 - t2_7/7 + t2_7/7/4096
umull r5,r10,r8,r12
mov r11,#0
smlal r10,r11,r9,r12 @ t2_3 * 0x55555555
adds r10,r10,r11
adc r11,r11,r11,asr#31 @ t2_3/3 Q63
subs r2,r2,r10
sbc r3,r3,r11 @ Q63 phi0+phi1
lsls r4,r4,#11 @ t2 Q32
umull r5,r8,r4,r0 @ t2*y0l
it mi
submi r8,r8,r0 @ correction if t2 is negative
mov r9,r8,asr#31 @ sign extend
smlal r8,r9,r4,r1 @ t2*y0h
@ r8:r9 (t2*y0)<<11
umull r5,r10,r4,r6 @ t2*x0l
it mi
submi r10,r10,r6 @ correction if t2 is negative
mov r11,r10,asr#31 @ sign extend
smlal r10,r11,r4,r7
@ r10:r11 (t2*x0)<<11
adds r6,r8
adc r7,r7,r9
subs r0,r10
sbc r1,r1,r11
@ r0:r1 y1=y0-t2*x0
@ r2:r3 phi0+phi1
@ r6:r7 x1=x0+t2*y0
mov r4,#0xffffffff
lsrs r5,r7,#14
udiv r4,r4,r5 @ rx1 Q16
lsrs r5,r0,#11
orrs r5,r5,r1,lsl#21 @ N set according to y1, hence also t3
smmul r5,r4,r5 @ t3=(y1>>11)*rx1 Q35
lsr r6,r6,#3
orr r6,r6,r7,lsl#29
umull r11,r8,r5,r6 @ t3*x1l
lsr r10,r7,#3
it mi
submi r8,r8,r6 @ correction if t3 is negative
mla r8,r5,r10,r8
adds r2,r2,r5,lsl#28
adc r3,r3,r5,asr#4
sub r0,r0,r8
@ r0: y2
@ r2:r3 phi0+phi1+phi2
@ r4: rx1
@ r5: t3
smull r8,r9,r0,r4 @ y2*rx1
@ stall
lsrs r8,#14
orr r8,r8,r9,lsl#18 @ t4
smmlsr r0,r8,r7,r0
adds r2,r2,r8,asr#1
adc r3,r3,r8,asr#31
@ r0: y3
@ r4: rx1
mul r4,r4,r0
adds r0,r2,r4,asr#15
adc r1,r3,r4,asr#31
@ r0:r1 result over reduced range Q63
pop {r7} @ restore flags
40:
lsrs r1,#1
rrxs r0,r0
@ r0:r1 result over reduced range Q62
lsl r6,r7,#30 @ b1 -> b31
eor r0,r0,r6,asr#31 @ negate if required
eor r1,r1,r6,asr#31
movlong r2,0x10B4611A @ π/2 Q62 low word
movlong r3,0x6487ED51 @ π/2 Q62 high word
lsr r6,r7,#2 @ quadrants to add
umlal r0,r1,r6,r2
mla r1,r6,r3,r1
mov r4,#0x400-12 @ for packing Q62
60:
bl dpack_q
eors r1,r1,r7,lsl#31
pop {r4-r11,r15}
#endif