| /* |
| * Copyright (c) 2024 Raspberry Pi (Trading) Ltd. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| #if !PICO_RP2040 |
| #include "pico/asm_helper.S" |
| |
| pico_default_asm_setup |
| |
| .macro float_section name |
| #if PICO_FLOAT_IN_RAM |
| .section RAM_SECTION_NAME(\name), "ax" |
| #else |
| .section SECTION_NAME(\name), "ax" |
| #endif |
| .endm |
| |
| .macro float_wrapper_section func |
| float_section WRAPPER_FUNC_NAME(\func) |
| .endm |
| |
| .extern rtwopi |
| .extern logtab0 |
| .extern exptab0 |
| .extern exptab1 |
| .extern exptab2 |
| .extern trigtab |
| |
| @ load a 32-bit constant n into register rx |
| .macro movlong rx,n |
| movw \rx,#(\n)&0xffff |
| movt \rx,#((\n)>>16)&0xffff |
| .endm |
| |
| float_section frrcore |
| |
| @ input: |
| @ r0 mantissa m Q23 |
| @ r1 exponent e>=-32, typically offset by +9 |
| @ output: |
| @ r0..r1 preserved |
| @ r6 range reduced result |
| @ r2,r3,r4,r5 trashed |
| frr_core: |
| ldr r2,=rtwopi |
| asrs r3,r1,#5 @ k=e/32, k<=5 for e offsets up to 9+32 |
| add r2,r2,r3,lsl#2 @ p |
| and r3,r1,#31 @ s=e%32 |
| mov r4,#1 |
| lsls r4,r4,r3 @ 1<<s |
| umull r3,r4,r4,r0 |
| @ r2 p |
| @ r3:r4 u0:u1 = m<<(e%32); u1 is never more than 2<<23 |
| ldr r5,[r2,#12] @ a0=p[3] |
| umull r5,r6,r5,r4 @ r0=a0*u1 hi, discard lo |
| @ r6 r0 |
| ldr r5,[r2,#8] @ a1=p[2] |
| mla r6,r5,r4,r6 @ a1*u1 lo, discard hi |
| umlal r5,r6,r5,r3 @ a1*u0 hi, discard lo |
| @ r6 r0 |
| ldr r5,[r2,#4] @ a2=p[1] |
| mla r6,r5,r3,r6 @ r0+=a2*u0 |
| bx r14 |
| |
| |
| float_wrapper_section expf |
| |
| 2: |
| @ we could use fadd macro to calculate x+1 here |
| @ need to add quadratic term |
| rsb r3,#1 @ 1-(e+9) |
| lsrs r0,r3 @ Q31 |
| smmulr r1,r0,r0 @ Q30 |
| cmp r12,#0 |
| bne 1f |
| add r0,r0,r1 |
| lsrs r0,#8 |
| adc r0,r0,#0x3f800000 @ result is just over 1 |
| bx r14 |
| 1: |
| sub r0,r1,r0 |
| asrs r0,#7 |
| adc r0,r0,#0x3f800000 @ result is just under 1 |
| bx r14 |
| |
| 7: |
| movs r1,#1 |
| bfi r0,r1,#23,#9 @ implied 1, clear sign bit |
| adds r3,#9 |
| bmi 2b @ <~2^-9? return 1+x+x²/2 |
| movlong r1,0xb17217f8 @ ln2 Q32 |
| lsls r0,r3 @ Q32 |
| cmp r12,#0 |
| bne 8f |
| cmp r0,r1 |
| blo 6f |
| sub r0,r0,r1 |
| mov r12,#1 |
| b 6f |
| |
| 8: |
| mvn r12,#0 |
| subs r0,r1,r0 |
| bhs 6f |
| add r0,r0,r1 |
| mvn r12,#1 |
| b 6f |
| |
| wrapper_func expf |
| ands r12,r0,#1<<31 @ save sign |
| ubfx r3,r0,#23,#8 @ get exponent |
| subs r3,r3,#0x7f |
| bmi 7b @ e<0? |
| cmp r3,#7 |
| bge 20f @ overflow, Inf or NaN? |
| movs r1,#1 |
| bfi r0,r1,#23,#9 @ implied 1, clear sign bit |
| lsls r0,r3 @ x Q23 |
| eors r0,r0,r12,asr#31 |
| adds r0,r0,r12,lsr#31 @ negate if sign bit set |
| mov r1,#0xB8AA @ 1/ln2 Q15, rounded down |
| add r1,r1,r12,lsr#31 @ rounded up if we have a negative argument |
| smull r1,r2,r1,r0 @ Q38 |
| movlong r1,0xb17217f8 @ ln2 Q32 |
| asr r12,r2,#6 @ Q0 exponent candidate |
| lsls r0,#9 @ Q32 |
| mls r0,r12,r1,r0 @ Q32-Q0*Q32=Q32 |
| |
| 6: |
| lsrs r1,r0,#28 |
| ldr r2,=exptab0 |
| add r2,r2,r1,lsl#3 |
| ldmia r2,{r2-r3} |
| and r2,r2,#63 |
| orr r2,#64 @ y=(t&0x3f)+0x40; Q6 |
| sub r0,r3 |
| |
| lsrs r1,r0,#24 |
| ldr r3,=exptab1+4 |
| ldr r3,[r3,r1,lsl#3] |
| add r1,#256 |
| muls r2,r2,r1 @ y Q14 |
| sub r0,r3 |
| |
| lsrs r1,r0,#21 |
| ldr r3,=exptab2+4 |
| ldr r3,[r3,r1,lsl#3] |
| add r1,r1,r1 |
| add r1,#4096 |
| mla r2,r2,r1,r2 @ y Q26 |
| sub r0,r3 @ ε Q32 |
| smmlar r2,r2,r0,r2 @ y(1+ε) Q26 |
| |
| cmp r2,#1<<27 |
| bhs 1f |
| 2: |
| cmp r12,#0x7f |
| bgt 23f |
| cmn r12,#0x7e |
| blt 23f |
| lsls r0,r12,#23 |
| adds r0,r0,r2,lsr#3 |
| adc r0,r0,#0x3f000000 |
| bx r14 |
| |
| @ mantissa calculation has overflowed |
| 1: |
| lsrs r2,#1 |
| adds r12,#1 |
| b 2b |
| |
| 20: |
| @ process Inf/NaN for fexp |
| cmp r3,#0x80 |
| bne 22f |
| lsls r2,r0,#9 |
| ite eq |
| biceq r0,r0,r0,asr#31 @ +Inf -> + Inf; -Inf -> +0 |
| orrne r0,r0,#0x00400000 |
| bx r14 |
| |
| 23: |
| ands r12,r12,#1<<31 |
| 22: |
| eors r0,r12,#1<<31 |
| subs r0,r0,r0,lsr#8 @ overflow: convert to +0 or +Inf |
| bx r14 |
| |
| |
| |
| |
| float_wrapper_section logf |
| |
| 1: |
| movlong r0,0xff800000 @ return -Inf |
| bx r14 |
| |
| 4: |
| lsls r1,r0,#9 |
| it ne |
| orrne r0,r0,#0x00400000 |
| bx r14 |
| |
| @ here result may be close to zero |
| 5: |
| sub r1,r0,#0x3f800000 |
| bmi 7f |
| cmp r1,#0x8000 |
| bge 6f @ |ε|>~2^-8? |
| @ here ε positive |
| clz r12,r1 |
| sub r3,r12,#1 |
| b 8f |
| |
| 7: |
| rsbs r2,r1,#0 |
| cmp r2,#0x8000 |
| bge 6f @ |ε|>~2^-8? |
| @ here ε negative |
| @ r1: x Q24 |
| clz r12,r2 |
| sub r3,r12,#2 |
| 8: |
| sub r12,#10 |
| lsls r0,r1,r3 @ ε Q24+r3 |
| beq 10f @ ε=0? |
| smmulr r1,r0,r0 |
| asrs r1,r1,r12 |
| sub r0,r0,r1,asr#1 @ - ε²/2 |
| smmulr r1,r1,r0 |
| asrs r2,r1,r12 |
| movs r3,#0x55 |
| mul r2,r2,r3 |
| adds r0,r0,r2,asr#8 @ + ~ ε³/3 |
| rsb r1,r12,#117 |
| b 9f |
| |
| wrapper_func logf |
| lsls r3,r0,#1 |
| bcs 1b @ x<0? |
| lsrs r3,#24 |
| beq 1b @ x==0/denormal? |
| sub r12,r3,#0x7e |
| cmp r12,#0x81 @ +Inf/NaN? |
| beq 4b |
| push {r14} |
| cmp r12,#1 @ result will be near zero? |
| bls 5b |
| 6: |
| movs r2,#1 |
| bfi r0,r2,#23,#9 @ set implied 1, clear exponent Q52 |
| |
| lsls r0,#8 |
| |
| lsrs r1,r0,#27 |
| ldr r2,=logtab0-16*8 |
| add r2,r2,r1,lsl#3 |
| ldmia r2,{r2-r3} |
| lsls r2,#26 |
| umlal r2,r0,r2,r0 |
| |
| mvn r1,r0,asr#24 |
| ldr r2,=exptab1+4 |
| ldr r2,[r2,r1,lsl#3] |
| add r3,r3,r2 |
| lsls r1,#24 |
| umlal r1,r0,r1,r0 |
| |
| ldr r2,=exptab2+4 |
| mvn r1,r0,asr#21 |
| ldr r2,[r2,r1,lsl#3] |
| lsls r1,#21 |
| orr r1,#0x00100000 |
| umlal r1,r0,r1,r0 |
| add r3,r3,r2 |
| |
| @ r0: ε Q32 |
| @ r3: log y |
| smmulr r1,r0,r0 @ ε² Q32 |
| sub r3,r0 |
| add r3,r3,r1,lsr#1 @ log u - ε + ε²/2 Q32 |
| movlong r2,0xb17218 @ ln2 Q24, but accurate to 2^-29 or so |
| cmp r12,#1 @ result will be near zero? |
| bls 2f |
| mul r2,r2,r12 |
| mov r1,#125 |
| add r3,#255 @ reduce systematic error |
| subs r0,r2,r3,lsr#8 |
| 9: |
| bmi 1f |
| bl fpack_q |
| 10: |
| pop {r15} |
| |
| 2: |
| mov r1,#117 |
| bne 3f @ r12=0? |
| @ here r12=1 |
| movlong r2,0xb17217f8 |
| sub r0,r2,r3 |
| bl fpack_q |
| pop {r15} |
| |
| 3: |
| mov r0,r3 |
| bl fpack_q |
| orrs r0,r0,#1<<31 |
| pop {r15} |
| |
| 1: |
| rsbs r0,#0 |
| bl fpack_q |
| orrs r0,r0,#1<<31 |
| pop {r15} |
| |
| |
| float_wrapper_section atan2f |
| |
| @ fatan small reduced angle case |
| @ r0 y/x in IEEE format, 0..2^-8 |
| @ r2 e+6 <0 |
| @ r3 #1 |
| @ r6 flags |
| 20: |
| rsbs r4,r2,#0 @ -e-6 = shift down of mantissa required to get to Q29 >0 |
| cmp r4,#7 @ -e-6 ≥ 7 [ e ≤ -13 ] |
| bge 1f @ very small reduced angle? atn θ ~ θ |
| @ do a power series |
| bfi r0,r3,#23,#9 @ fix up mantissa Q23-e |
| lsls r0,#7 @ Q30-e |
| smmulr r1,r0,r0 @ θ² Q60-2e-Q32=Q28-2e |
| lsrs r1,r4 @ Q28-2e + (e+6) = Q34-e |
| smmulr r1,r1,r0 @ θ³ Q34-e+Q30-e-Q32=Q32-2e |
| mov r3,#0x55555555 @ 1/3 Q32 |
| lsrs r1,r4 @ Q32-2e + e+6 = Q38-e |
| smmulr r1,r1,r3 @ θ³/3 Q38-e |
| sub r0,r0,r1,lsr#8 @ Q30-e |
| cmp r6,#4 @ at least one quadrant to add? |
| bhs 2f |
| add r1,r2,#113 |
| bl fpack_q |
| eors r0,r0,r6,lsl#31 @ fix sign |
| pop {r4-r6,r15} |
| |
| @ here reduced angle is < 2^-12 |
| 1: |
| cmp r6,#4 |
| bhs 3f @ at least one quadrant to add? |
| eors r0,r0,r6,lsl#31 @ no: return y/x with the correct sign |
| pop {r4-r6,r15} |
| |
| 3: |
| bfi r0,r3,#23,#9 @ fix up mantissa |
| lsrs r0,r4 @ Q29 |
| lsls r0,#1 @ Q30 |
| b 40f |
| |
| 2: |
| lsrs r0,r4 @ Q30-e + e+6 = Q36 |
| lsrs r0,#6 @ Q30 |
| b 40f |
| |
| @ case where reduced (x',y') has x' infinite |
| 71: |
| sbfx r4,r0,#23,#8 |
| movs r0,#0 |
| cmn r4,#1 @ y' also infinite? |
| bne 80f |
| mov r0,#0x3f800000 @ both infinite: pretend ∞/∞=1 |
| b 80f |
| |
| @ case where reduced (x',y') has y' zero |
| 70: |
| ubfx r4,r1,#23,#8 |
| movs r0,#0 |
| cbnz r4,80f @ x' also zero? |
| tst r6,#4 |
| beq 80f @ already in quadrants 0/±2? then 0/0 result will be correct |
| tst r6,#2 |
| ite eq |
| addeq r6,#6 |
| subne r6,#6 @ fix up when in quadrants ±0 |
| b 80f |
| |
| 90: |
| movs r0,r1 |
| 91: |
| orrs r0,r0,#0x00400000 |
| pop {r4-r6,r15} |
| |
| wrapper_func atan2f |
| push {r4-r6,r14} |
| lsls r4,r1,#1 |
| cmp r4,#0xff000000 |
| bhi 90b @ y NaN? |
| lsls r4,r0,#1 |
| cmp r4,#0xff000000 |
| bhi 91b @ x NaN? |
| lsrs r6,r0,#31 @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition |
| bic r0,#1<<31 |
| @ y now positive |
| movs r1,r1 |
| bpl 1f |
| |
| @ here y positive, x negative |
| adds r6,#10 |
| bic r1,r1,#1<<31 |
| cmp r1,r0 |
| bhi 4f @ |x| > y: 3rd octant |
| @ |x| < y: 2nd octant |
| subs r6,#6 |
| b 3f |
| |
| 1: |
| @ here x and y positive |
| cmp r1,r0 |
| bhs 4f |
| @ x < y: 1st octant |
| adds r6,#6 |
| 3: |
| movs r2,r1 @ exchange x and y |
| movs r1,r0 |
| movs r0,r2 |
| 4: |
| @ here |
| @ r0 y' |
| @ r1 x' |
| @ where both x' and y' are positive, y'/x' < 1+δ, and the final result is |
| @ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r6b3..2, the inner negation |
| @ is given by r6b1 and the outer negation by r6b0. x' can be infinite, or both x' and |
| @ y' can be infinite, but not y' alone. |
| |
| sbfx r2,r1,#23,#8 |
| cmn r2,#1 |
| beq 71b @ x' infinite? |
| ubfx r2,r0,#23,#8 |
| cmp r2,#0 |
| beq 70b @ y' zero? |
| bl __aeabi_fdiv |
| 80: |
| @ r0 y/x in IEEE format, 0..1 |
| lsr r2,r0,#23 @ exponent |
| movs r3,#1 |
| subs r2,#0x7f-6 |
| bmi 20b |
| bfi r0,r3,#23,#9 @ fix up mantissa |
| lsls r0,r2 |
| lsls r0,#2 |
| 50: |
| |
| @ from here atan2(y,1) where 1 implied |
| @ r0 y Q31 0≤y<1+δ |
| |
| lsrs r2,r0,#16 |
| mul r3,r2,r2 @ y^2 |
| movw r4,#0x895c |
| muls r2,r2,r4 @ y*0x895c |
| movw r5,#0x1227 |
| lsrs r3,#14 |
| mls r2,r3,r5,r2 |
| subs r2,#0x330000 @ Chebyshev approximation to atn(y) |
| lsrs r2,#25 |
| ldr r3,=trigtab+4 |
| add r3,r3,r2,lsl#4 |
| ldmia r3,{r3-r5} |
| @ r0 y Q30 |
| @ r3 phi0 Q32 |
| @ r4 sphi0 Q31 |
| @ r5 cphi0 Q31 |
| @ r6 flags |
| @ x0= cphi0 + sphi0*y |
| @ y0=-sphi0 + cphi0*y |
| umull r12,r1,r0,r4 @ sphi0*y |
| umull r12,r2,r0,r5 @ cphi0*y |
| add r1,r1,r5,lsr#1 @ x0 Q30 |
| sub r2,r2,r4,lsr#1 @ y0 Q30 |
| 11: |
| @ r1 x0 Q30 |
| @ r2 y0 Q30 |
| @ r3 phi0 Q32 |
| @ r6 flags |
| mov r0,#0xffffffff |
| lsrs r4,r1,#15 |
| udiv r12,r0,r4 @ rx0 Q17 |
| lsls r4,r2,#6 @ y0 Q36 |
| smmul r4,r4,r12 @ t2=y0*rx0 Q21 |
| lsls r5,r4,#11 @ t2 Q32 |
| smmlar r0,r5,r2,r1 |
| smmlsr r1,r5,r1,r2 |
| @ r0 x1 Q30 |
| @ r1 y1 Q30 |
| @ r3 phi0 |
| @ r4 r2 Q21 |
| @ r12 rx0 Q17 |
| mul r5,r4,r4 @ t2_2 Q42 |
| smull r2,r5,r4,r5 @ t2_3 Q63 |
| add r3,r3,r4,lsl#11 @ Q32 |
| lsls r5,#1 @ Q32 |
| mov r2,#0x55555555 @ 1/3 |
| smmlsr r3,r2,r5,r3 @ t2_3/3 |
| |
| @ r0 x1 Q30 |
| @ r1 y1 Q30 |
| @ r3 phi0+phi1 Q32 |
| @ r12 rx0 Q17 |
| mul r0,r1,r12 @ y1*rx0 Q30+Q17=Q47 |
| add r3,r3,r0,asr#15 |
| @ r3 phi0+phi1+phi2, result over reduced range Q32 |
| @ r6 flags |
| |
| lsrs r0,r3,#2 @ Q30 |
| @ adc r0,#0 @ rounding |
| 40: |
| lsl r5,r6,#30 @ b1 -> b31 |
| eor r0,r0,r5,asr#31 @ negate if required |
| movlong r1,0x6487ED51 @ π/2 Q30 |
| |
| lsr r5,r6,#2 @ quadrants to add |
| mla r0,r5,r1,r0 |
| mov r1,#0x80-9 @ for packing Q30 |
| 60: |
| bl fpack_q |
| eors r0,r0,r6,lsl#31 |
| pop {r4-r6,r15} |
| |
| @======================================= |
| |
| float_wrapper_section fpack |
| |
| @ fnegpack: negate and pack |
| @ fpack_q31: |
| @ input |
| @ r0 Q31 result, must not be zero |
| @ r1 exponent offset [fpack_q only] |
| @ output |
| @ r0 IEEE single |
| @ trashes (r1,)r2 |
| fnegpack_q31: |
| rsbs r0,r0,#0 |
| fpack_q31: |
| mov r1,#0x7f-9 @ exponent |
| fpack_q: |
| clz r2,r0 |
| rsbs r2,#8 |
| bmi 1f |
| lsrs r0,r0,r2 @ save rounding bit in carry |
| add r2,r2,r1 |
| adc r0,r0,r2,lsl#23 @ insert exponent |
| bx r14 |
| |
| 1: |
| rsb r2,#0 |
| lsls r0,r0,r2 |
| sub r2,r1,r2 |
| add r0,r0,r2,lsl#23 |
| bx r14 |
| |
| float_wrapper_section tanf |
| |
| wrapper_func tanf |
| push {r14} |
| bl sincosf_raw |
| bl __aeabi_fdiv |
| pop {r15} |
| |
| float_wrapper_section fsin_fcos |
| |
| 10: @ process Inf/NaN for sinf and fcos |
| lsls r1,r0,#9 |
| it eq |
| orreq r0,#0x80000000 |
| orr r0,#0x00400000 @ turn Inf into NaN |
| movs r1,r0 @ copy result to cosine output |
| bx r14 |
| |
| @ case where angle is very small (<2^-32) before reduction |
| 32: |
| adds r1,r1,#0x7f |
| it eq |
| moveq r0,#0 @ flush denormal to zero |
| |
| movs r1,0x3f800000 @ return x for sine, 1 for cosine |
| tst r12,#2 |
| it ne |
| movne r0,r1 @ calculating cosine? move to result registers |
| bx r14 |
| |
| 40: |
| @ case where range-reduced angle is very small |
| @ here |
| @ r0 mantissa |
| @ r1 exponent+9 |
| @ r6 abs range-reduced angle / 2π < 2^-7 Q32 |
| @ r12b31: dsincos flag |
| @ r12b30: original argument sign |
| @ r12b2..1: quadrant count |
| @ r12b0: sign of reduced angle |
| push {r12} |
| movs r12,#0 |
| 2: |
| add r12,#2 |
| add r1,#2 |
| bl frr_core @ repeat range reduction with extra factor of 2^2 (, 2^4, 2^6, 2^8,...) |
| eors r4,r6,r6,asr#31 @ we want to keep the sign in r6b31 for later |
| cmp r4,#1<<26 @ ≥ 2^-6? |
| bhs 1f @ loop until the result is big enough |
| cmp r12,#32 @ safety net |
| bne 2b |
| 1: |
| @ here r4 is the abs range-reduced angle Q32+r12, 2^-6..2^-4 in Q32 terms |
| |
| @ 2π=6.487ED511 (0B4611A6...) |
| movlong r5,0x487ED511 @ 2π Q64 high fractional word |
| umull r2,r0,r4,r5 |
| movs r5,#6 @ 2π integer part |
| mla r0,r4,r5,r0 |
| |
| @ here |
| @ r0 θ, abs range reduced angle θ 0..π/4 Q32+r12, 2π * 2^-6..2^-4 in Q32 terms (so top bit is clear) |
| @ r6b31: sign of reduced angle |
| @ r12: excess exponent ≥2, multiple of 2 |
| @ r0 / 2^r12 < ~ 2π * 2^-7 so for sin we need to go to term in x^3 |
| |
| smmulr r1,r0,r0 @ θ² Q32+2*r12 |
| lsrs r1,r1,r12 @ θ² Q32+r12 |
| lsrs r1,r1,r12 @ θ² Q32 |
| movs r4,#0x55555555 @ 1/3 Q32 |
| smmulr r4,r1,r4 @ θ²/3 Q32 |
| smmulr r2,r4,r1 @ θ⁴/3 Q32 |
| rsb r3,r1,r2,lsr#2 @ -θ²+θ⁴/12) Q32 |
| asrs r3,#9 @ -θ²/2+θ⁴/24 Q24 |
| adc r3,r3,#0x3f800000 @ IEEE single with rounding |
| |
| smmulr r2,r4,r0 @ θ³/3 Q32+r12 |
| sub r0,r0,r2,lsr#1 @ θ-θ³/6 Q32+r12 |
| rsb r1,r12,#117 |
| bl fpack_q |
| @ here |
| @ r0 packed sin |
| @ r3 packed cos |
| @ r6b31: sign of reduced angle |
| pop {r12} @ get flags |
| lsrs r6,#31 |
| bfi r12,r6,#0,#1 |
| |
| asrs r2,r12,#1 |
| bmi 23f @ doing sincos? |
| asrs r2,#1 |
| bcc 21f @ need sine? |
| @ need cosine: |
| ands r12,#4 |
| orrs r0,r3,r12,lsl#29 @ insert sign |
| pop {r4-r6,r15} |
| |
| 21: |
| eors r12,r12,r12,lsr#2 |
| orrs r0,r0,r12,lsl#31 @ insert sign |
| pop {r4-r6,r15} |
| |
| 23: |
| ands r2,r12,#4 |
| orrs r3,r3,r2,lsl#29 @ insert sign |
| push {r3} |
| @ drop into sincosf code below... |
| b 20f |
| |
| wrapper_func sincosf |
| push {r1, r2, lr} |
| bl sincosf_raw |
| pop {r2, r3} |
| str r0, [r2] |
| str r1, [r3] |
| pop {pc} |
| |
| sincosf_raw: |
| ands r12,r0,#1<<31 |
| lsrs r12,#1 @ save argument sign in r12b30 |
| orrs r12,r12,#1<<31 @ flag we want both results in r12b31 |
| b 1f |
| |
| wrapper_func sinf |
| lsrs r12,r0,#29 @ negative argument -> +2 quadrants |
| ands r12,#4 |
| b 1f |
| |
| wrapper_func cosf |
| movs r12,#2 @ cos -> +1 quadrant |
| 1: |
| ubfx r1,r0,#23,#8 @ get exponent |
| sub r1,r1,#0x7f |
| cmp r1,#0x80 |
| beq 10b @ Inf or NaN? |
| cmn r1,#32 |
| blt 32b @ very small argument? |
| movs r3,#1 |
| bfi r0,r3,#23,#9 @ fix implied 1 in mantissa |
| push {r4-r6,r14} |
| add r1,#9 @ e+9 |
| bl frr_core |
| @ r6 θ/2π 0..1 Q64 |
| lsrs r4,r6,#30 @ quadrant count |
| adc r4,r4,#0 @ rounded |
| sub r6,r6,r4,lsl#30 @ now -0.125≤r6<+0.125 Q32 |
| add r12,r12,r4,lsl#1 |
| orr r12,r12,r6,lsr#31 @ sign into r12b0 |
| @ r12b2..1: quadrant count |
| @ r12b0: sign of reduced angle |
| eors r6,r6,r6,asr#31 @ absolute value of reduced angle 0≤r7<0.125 Q32 |
| cmp r6,#1<<25 @ θ / 2π < 2^-7? |
| blo 40b |
| |
| @ 2π=6.487ED511 (0B4611A6...) |
| movlong r5,0x487ED511 @ 2π Q64 high fractional word |
| umull r2,r0,r6,r5 |
| movs r5,#6 @ 2π integer part |
| mla r0,r6,r5,r0 |
| @ r0 range reduced angle θ 0..π/4 Q32 |
| lsrs r2,r0,#27 |
| ldr r3,=trigtab+4 |
| add r3,r3,r2,lsl#4 |
| ldmia r3,{r1-r3} |
| subs r0,r1 |
| @ r0: ε Q32 |ε| < 1.17*2^-6 |
| @ r2: sin φ Q31 |
| @ r3: cos φ Q31 |
| asrs r1,r12,#1 |
| bmi 5f @ doing sincosf? |
| asrs r1,#1 |
| bcs 3f @ need cosine? |
| @ here need sine |
| smmulr r1,r0,r0 @ ε² Q32 |
| mov r4,#0x55555555 @ 1/3 Q32 |
| asrs r1,#1 |
| smmlsr r2,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε |
| smmulr r1,r1,r0 @ ε³/2 |
| smmlsr r1,r1,r4,r0 @ ε - ε³/6 |
| |
| smmlar r0,r1,r3,r2 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε) |
| @ the sign of the result is r12b0^r12b2 |
| bl fpack_q31 |
| eors r12,r12,r12,lsr#2 |
| orrs r0,r0,r12,lsl#31 @ insert sign |
| pop {r4-r6,r15} |
| |
| 3: |
| @ here need cosine |
| smmulr r1,r0,r0 @ ε² Q32 |
| mov r4,#0x55555555 @ 1/3 Q32 |
| asrs r1,#1 |
| smmlsr r3,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε |
| smmulr r1,r1,r0 @ ε³/2 |
| smmlsr r1,r1,r4,r0 @ ε - ε³/6 |
| |
| smmlsr r0,r1,r2,r3 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε) |
| @ the sign of the result is r12b2 |
| bl fpack_q31 |
| ands r12,#4 |
| orrs r0,r0,r12,lsl#29 @ insert sign |
| pop {r4-r6,r15} |
| |
| 5: |
| @ here doing sincosf |
| smmulr r1,r0,r0 @ ε² Q32 |
| mov r6,#0x55555555 @ 1/3 Q32 |
| asrs r1,#1 |
| smmlsr r4,r1,r2,r2 @ sin φ - ε²/2*sin φ ~ sin φ cos ε |
| smmlsr r5,r1,r3,r3 @ cos φ - ε²/2*cos φ ~ cos φ cos ε |
| smmulr r1,r1,r0 @ ε³/2 |
| smmlsr r6,r1,r6,r0 @ ε - ε³/6 |
| @ here |
| @ r2 sin φ |
| @ r3 cos φ |
| @ r4 sin φ cos ε |
| @ r5 cos φ cos ε |
| @ r6 ε - ε³/6 ~ sin ε |
| smmlsr r0,r6,r2,r5 @ cos φ cos ε - sin φ (ε - ε³/6) ~ cos (φ+ε) |
| bl fpack_q31 |
| ands r5,r12,#4 |
| eors r0,r0,r5,lsl#29 @ negate cosine in quadrants 2 and 3 |
| push {r0} |
| smmlar r0,r6,r3,r4 @ sin φ cos ε + cos φ (ε - ε³/6) ~ sin (φ+ε) |
| bl fpack_q31 |
| 20: |
| eors r4,r12,r12,lsr#1 |
| eors r4,r4,r12,lsr#2 |
| ands r5,r12,#1<<30 |
| tst r12,#2 @ exchange sine and cosine in odd quadrants |
| beq 1f |
| eors r1,r0,r4,lsl#31 @ negate sine on b0^b1^b2 |
| pop {r0} |
| eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative |
| pop {r4-r6,r15} |
| 1: |
| pop {r1} |
| eors r0,r0,r4,lsl#31 @ negate sine on b0^b1^b2 |
| eors r0,r0,r5,lsl#1 @ negate sine result if argument was negative |
| pop {r4-r6,r15} |
| |
| #endif |