| /* |
| * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| #include "pico/asm_helper.S" |
| |
| pico_default_asm_setup |
| |
| // double_section name: switch to the section for \name with flags "ax". |
| // SECTION_NAME(name) is used unless PICO_DOUBLE_IN_RAM is non-zero, in which case |
| // RAM_SECTION_NAME(name) is used instead. |
| .macro double_section name |
| // todo separate flag for shims? |
| #if !(PICO_DOUBLE_IN_RAM) |
| .section SECTION_NAME(\name), "ax" |
| #else |
| .section RAM_SECTION_NAME(\name), "ax" |
| #endif |
| .endm |
| |
| double_section double_table_shim_on_use_helper |
| // On-first-use resolver for one sd_table entry. |
| // In: ip = address to start scanning from (must be non-zero; checked in debug builds). |
| // Scans forward one halfword at a time for a halfword whose top byte is 0xdf; its low byte |
| // is used as a byte offset into sd_table. The 32-bit word that follows that halfword is |
| // stored into sd_table at that offset and also written over the stacked lr, so the final |
| // pop restores r0-r2 and continues at that word rather than returning to the caller. |
| regular_func double_table_shim_on_use_helper |
| push {r0, r1, r2, lr} |
| mov r0, r12 // r12 is ip: scan start |
| #ifndef NDEBUG |
| // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro |
| cmp r0, #0 |
| bne 1f |
| bkpt #0 |
| #endif |
| 1: |
| ldrh r1, [r0, #0] // fetch next halfword |
| adds r0, #2 // and step past it |
| lsrs r2, r1, #8 // top byte |
| cmp r2, #0xdf |
| bne 1b // keep scanning until the 0xdf marker is found |
| uxtb r1, r1 // r1 holds table offset |
| lsrs r2, r0, #2 // carry = bit 1 of the address of the following word |
| bcs 2f |
| ldr r0, [r0] // word aligned: single load |
| b 3f |
| 2: |
| // unaligned: build the word from two halfword loads, low half first |
| ldrh r2, [r0, #0] |
| ldrh r0, [r0, #2] |
| lsls r0, r0, #16 |
| orrs r0, r0, r2 |
| 3: |
| ldr r2, =sd_table |
| str r0, [r2, r1] // record the word in the table entry |
| str r0, [sp, #12] // overwrite saved lr so that the pop below branches to it |
| pop {r0, r1, r2, pc} |
| |
| #if PICO_DOUBLE_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED |
| // Note that the V1 ROM has no double support, so this is basically the identical |
| // library, and shim inter-function calls do not bother to redirect back thru the |
| // wrapper functions |
| |
| @ use_hw_div: assembly-time switch tested by ".if use_hw_div" in ddiv_shim; when non-zero |
| @ the initial reciprocal estimate is obtained by writing UDIVIDEND/UDIVISOR and reading |
| @ QUOTIENT at the offsets below, otherwise ddiv_shim calls __aeabi_uidiv. |
| .equ use_hw_div,1 |
| @ IOPORT is the base address; ddiv_shim builds it in a register and adds the DIV_ offsets. |
| .equ IOPORT ,0xd0000000 |
| .equ DIV_UDIVIDEND,0x00000060 |
| .equ DIV_UDIVISOR ,0x00000064 |
| .equ DIV_QUOTIENT ,0x00000070 |
| @ DIV_CSR is not referenced in the visible part of this file. |
| .equ DIV_CSR ,0x00000078 |
| |
| @ Notation: |
| @ rx:ry means the concatenation of rx and ry with rx having the less significant bits |
| |
| @ debug: assembly-time switch for the mdump macro below; 0 makes mdump expand to nothing. |
| .equ debug,0 |
| @ mdump k: debugging aid. When debug is non-zero it preserves r0-r3 and r14 on the stack, |
| @ calls osp, o1ch (with r0=k), dump (with the original r0-r3 restored), osp twice, |
| @ o8hex (with r0 = the saved r14) and onl, then restores r14 and r0-r3. |
| @ osp, o1ch, dump, o8hex and onl are not defined in the visible part of this file; |
| @ NOTE(review): presumably character/hex output helpers - confirm before enabling debug. |
| .macro mdump k |
| .if debug |
| push {r0-r3} |
| push {r14} |
| push {r0-r3} |
| bl osp |
| movs r0,#\k |
| bl o1ch |
| pop {r0-r3} |
| bl dump |
| bl osp |
| bl osp |
| ldr r0,[r13] @ saved r14 is now at the top of the stack |
| bl o8hex @ r14 |
| bl onl |
| pop {r0} |
| mov r14,r0 |
| pop {r0-r3} |
| .endif |
| .endm |
| |
| |
| @ IEEE double in ra:rb -> |
| @ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set |
| @ exponent in re |
| @ sign in rs |
| @ trashes rt |
| @ rb is the word holding the sign and exponent fields (the high word of the double) |
| @ zero, denormal -> mantissa 1.0 (ra=0, rb=1<<20), exponent=-0x80000 |
| @ Inf, NaN -> mantissa 1.0 (ra=0, rb=1<<20), exponent=+0x77f000 |
| .macro mdunpack ra,rb,re,rs,rt |
| lsrs \re,\rb,#20 @ extract sign and exponent |
| subs \rs,\re,#1 |
| lsls \rs,#20 |
| subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1 |
| lsrs \rs,\re,#11 @ sign |
| lsls \re,#21 |
| lsrs \re,#21 @ exponent |
| beq l\@_1 @ zero exponent? |
| adds \rt,\re,#1 |
| lsrs \rt,#11 |
| beq l\@_2 @ exponent != 0x7ff? then done |
| l\@_1: |
| @ exponent field was 0 or 0x7ff: force the mantissa to 1.0 and rescale the exponent |
| @ to (exponent-128)<<12 so that it lies far outside the normal range |
| movs \ra,#0 |
| movs \rb,#1 |
| lsls \rb,#20 |
| subs \re,#128 |
| lsls \re,#12 |
| l\@_2: |
| .endm |
| |
| @ IEEE double in ra:rb -> |
| @ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 |
| @ exponent in re |
| @ trashes rt0 and rt1 |
| @ +zero, +denormal -> exponent=-0x80000 |
| @ -zero, -denormal -> exponent=-0x80000 |
| @ +Inf, +NaN -> exponent=+0x77f000 |
| @ -Inf, -NaN -> exponent=+0x77e000 |
| @ for all of the special values above the mantissa is replaced by +1.0 or -1.0 |
| @ rb is the word holding the sign and exponent fields (the high word of the double) |
| .macro mdunpacks ra,rb,re,rt0,rt1 |
| lsrs \re,\rb,#20 @ extract sign and exponent |
| lsrs \rt1,\rb,#31 @ sign only |
| subs \rt0,\re,#1 |
| lsls \rt0,#20 |
| subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1 |
| lsls \re,#21 @ sign bit goes to carry, exponent to the top of re |
| bcc l\@_1 @ skip on positive |
| mvns \rb,\rb @ negate mantissa |
| negs \ra,\ra @ carry set only when the low word was zero |
| bcc l\@_1 |
| adds \rb,#1 @ low word was zero: complete the two's complement of the high word |
| l\@_1: |
| lsrs \re,#21 |
| beq l\@_2 @ zero exponent? |
| adds \rt0,\re,#1 |
| lsrs \rt0,#11 |
| beq l\@_3 @ exponent != 0x7ff? then done |
| subs \re,\rt1 @ Inf/NaN only: negative values get an exponent one lower |
| l\@_2: |
| movs \ra,#0 |
| lsls \rt1,#1 @ +ve: 0 -ve: 2 |
| adds \rb,\rt1,#1 @ +ve: 1 -ve: 3 |
| lsls \rb,#30 @ create +/-1 mantissa |
| asrs \rb,#10 |
| subs \re,#128 |
| lsls \re,#12 |
| l\@_3: |
| .endm |
| |
| double_section WRAPPER_FUNC_NAME(__aeabi_dsub) |
| |
| // drsub first because it is the only one that needs alignment |
| // drsub_shim: reverse subtract. In: r0:r1 = x, r2:r3 = y. Swaps the two operands and |
| // falls through into dsub_shim, so the result returned in r0:r1 is y - x. |
| // push stores r0 at the lowest address, so popping r2-r3 first and r0-r1 second |
| // exchanges the two register pairs (popping in the pushed order would change nothing). |
| regular_func drsub_shim |
| push {r0-r3} |
| pop {r2-r3} // r2:r3 = original r0:r1 |
| pop {r0-r1} // r0:r1 = original r2:r3 |
| // fall thru |
| |
| // dsub_shim: r0:r1 - r2:r3 -> r0:r1, done by negating the second operand and |
| // continuing inside dadd_shim at da_entry (after dadd_shim's own register push). |
| regular_func dsub_shim |
| push {r4, r5, r6, r7, lr} @ same frame that the dadd exit paths pop |
| movs r4,#0x80 |
| lsls r4,r4,#24 @ r4 = 0x80000000: sign bit of the high word |
| eors r3,r3,r4 @ flip sign on second argument |
| b da_entry @ continue in dadd |
| |
| @ dadd_shim: double addition, r0:r1 + r2:r3 -> r0:r1 (r1 and r3 are the high words). |
| @ dsub_shim joins at da_entry after pushing the same {r4-r7,r14} frame. |
| @ Outline: unpack both operands to signed mantissas with mdunpacks; shift the operand with |
| @ the smaller exponent down, collecting the shifted-out bits in r4 as rounding + sticky bits; |
| @ add; then da_pack takes the magnitude, normalises, rounds (ties to even) and repacks. |
| @ Exponent overflow returns signed infinity, exponent underflow returns signed zero. |
| @ r12 is used as scratch for the result exponent. |
| .align 2 |
| double_section dadd_shim |
| regular_func dadd_shim |
| push {r4-r7,r14} |
| da_entry: |
| mdunpacks r0,r1,r4,r6,r7 |
| mdunpacks r2,r3,r5,r6,r7 |
| @ here r0:r1 = signed mantissa of x, r4 = xe; r2:r3 = signed mantissa of y, r5 = ye |
| subs r7,r5,r4 @ ye-xe |
| subs r6,r4,r5 @ xe-ye |
| bmi da_ygtx |
| @ here xe>=ye: need to shift y down r6 places |
| mov r12,r4 @ save exponent |
| cmp r6,#32 |
| bge da_xrgty @ xe rather greater than ye? |
| adds r7,#32 @ r7 = 32-(xe-ye): complementary shift |
| movs r4,r2 |
| lsls r4,r4,r7 @ rounding bit + sticky bits |
| da_xgty0: |
| movs r5,r3 |
| lsls r5,r5,r7 |
| lsrs r2,r6 |
| asrs r3,r6 |
| orrs r2,r5 |
| da_add: |
| adds r0,r2 |
| adcs r1,r3 |
| da_pack: |
| @ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4 |
| @ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we |
| @ cannot have gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4 |
| @ to provide a correct continuation of the exact result. |
| @ now pack result back up |
| lsrs r3,r1,#31 @ get sign bit |
| beq 1f @ skip on positive |
| mvns r1,r1 @ negate mantissa |
| mvns r0,r0 |
| movs r2,#0 |
| negs r4,r4 @ negate the extension bits too; carry propagates into r0:r1 |
| adcs r0,r2 |
| adcs r1,r2 |
| 1: |
| mov r2,r12 @ get exponent |
| lsrs r5,r1,#21 |
| bne da_0 @ shift down required? |
| lsrs r5,r1,#20 |
| bne da_1 @ normalised? |
| cmp r0,#0 |
| beq da_5 @ could mantissa be zero? |
| da_2: |
| @ shift r4:r0:r1 up one place at a time until the implied 1 reaches bit 20 of r1 |
| adds r4,r4 |
| adcs r0,r0 |
| adcs r1,r1 |
| subs r2,#1 @ adjust exponent |
| lsrs r5,r1,#20 |
| beq da_2 |
| da_1: |
| lsls r4,#1 @ check rounding bit |
| bcc da_3 |
| da_4: |
| adds r0,#1 @ round up |
| bcc 2f |
| adds r1,#1 |
| 2: |
| cmp r4,#0 @ sticky bits zero? |
| bne da_3 |
| lsrs r0,#1 @ round to even |
| lsls r0,#1 |
| da_3: |
| subs r2,#1 @ the implied 1 still in r1 supplies the other unit of the exponent field |
| bmi da_6 |
| adds r4,r2,#2 @ check if exponent is overflowing |
| lsrs r4,#11 |
| bne da_7 |
| lsls r2,#20 @ pack exponent and sign |
| add r1,r2 |
| lsls r3,#31 |
| add r1,r3 |
| pop {r4-r7,r15} |
| |
| da_7: |
| @ here exponent overflow: return signed infinity |
| lsls r1,r3,#31 |
| ldr r3,=0x7ff00000 |
| orrs r1,r3 |
| b 1f |
| da_6: |
| @ here exponent underflow: return signed zero |
| lsls r1,r3,#31 |
| 1: |
| movs r0,#0 |
| pop {r4-r7,r15} |
| |
| da_5: |
| @ here mantissa could be zero |
| cmp r1,#0 |
| bne da_2 |
| cmp r4,#0 |
| bne da_2 |
| @ inputs must have been of identical magnitude and opposite sign, so return +0 |
| @ (r0 and r1 are both zero at this point) |
| pop {r4-r7,r15} |
| |
| da_0: |
| @ here a shift down by one place is required for normalisation |
| adds r2,#1 @ adjust exponent |
| lsls r6,r0,#31 @ save rounding bit |
| lsrs r0,#1 |
| lsls r5,r1,#31 |
| orrs r0,r5 |
| lsrs r1,#1 |
| cmp r6,#0 |
| beq da_3 |
| b da_4 |
| |
| da_xrgty: @ xe>ye and shift>=32 places |
| cmp r6,#60 |
| bge da_xmgty @ xe much greater than ye? |
| subs r6,#32 |
| adds r7,#64 |
| |
| movs r4,r2 |
| lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits |
| beq 1f |
| movs r4,#1 @ so fold them into a single sticky bit |
| 1: |
| lsrs r2,r2,r6 |
| orrs r4,r2 |
| movs r2,r3 |
| lsls r3,r3,r7 |
| orrs r4,r3 |
| asrs r3,r2,#31 @ propagate sign bit |
| b da_xgty0 |
| |
| da_ygtx: |
| @ here ye>xe: need to shift x down r7 places |
| mov r12,r5 @ save exponent |
| cmp r7,#32 |
| bge da_yrgtx @ ye rather greater than xe? |
| adds r6,#32 @ r6 = 32-(ye-xe): complementary shift |
| movs r4,r0 |
| lsls r4,r4,r6 @ rounding bit + sticky bits |
| da_ygtx0: |
| movs r5,r1 |
| lsls r5,r5,r6 |
| lsrs r0,r7 |
| asrs r1,r7 |
| orrs r0,r5 |
| b da_add |
| |
| da_yrgtx: |
| cmp r7,#60 |
| bge da_ymgtx @ ye much greater than xe? |
| subs r7,#32 |
| adds r6,#64 |
| |
| movs r4,r0 |
| lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits |
| beq 1f |
| movs r4,#1 @ so fold them into a single sticky bit |
| 1: |
| lsrs r0,r0,r7 |
| orrs r4,r0 |
| movs r0,r1 |
| lsls r1,r1,r6 |
| orrs r4,r1 |
| asrs r1,r0,#31 @ propagate sign bit |
| b da_ygtx0 |
| |
| da_ymgtx: @ result is just y |
| movs r0,r2 |
| movs r1,r3 |
| da_xmgty: @ result is just x |
| movs r4,#0 @ clear sticky bits |
| b da_pack |
| |
| .ltorg |
| |
| @ equivalent of UMULL |
| @ rzl:rzh = rx*ry, unsigned 32x32 -> 64, assembled from four 16x16 partial products |
| @ needs five temporary registers |
| @ can have rt3==rx, in which case rx trashed |
| @ can have rt4==ry, in which case ry trashed |
| @ can have rzl==rx |
| @ can have rzh==ry |
| @ can have rzl,rzh==rt3,rt4 |
| .macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 |
| @ t0 t1 t2 t3 t4 |
| @ (x) (y) |
| uxth \rt0,\rx @ xl |
| uxth \rt1,\ry @ yl |
| muls \rt0,\rt1 @ xlyl=L |
| lsrs \rt2,\rx,#16 @ xh |
| muls \rt1,\rt2 @ xhyl=M0 |
| lsrs \rt4,\ry,#16 @ yh |
| muls \rt2,\rt4 @ xhyh=H |
| uxth \rt3,\rx @ xl |
| muls \rt3,\rt4 @ xlyh=M1 |
| adds \rt1,\rt3 @ M0+M1=M |
| bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H |
| movs \rt3,#1 @ 1 |
| lsls \rt3,#16 @ 0x10000 |
| adds \rt2,\rt3 @ H' |
| l\@_1: |
| @ t0 t1 t2 t3 t4 |
| @ (zl) (zh) |
| @ result = L + (M<<16) + (H'<<32) |
| lsls \rzl,\rt1,#16 @ ML |
| lsrs \rzh,\rt1,#16 @ MH |
| adds \rzl,\rt0 @ ZL |
| adcs \rzh,\rt2 @ ZH |
| .endm |
| |
| @ SUMULL: x signed, y unsigned |
| @ rzl:rzh = rx*ry as a signed 64-bit value |
| @ in table below ¯ means signed variable |
| @ needs five temporary registers |
| @ can have rt3==rx, in which case rx trashed |
| @ can have rt4==ry, in which case ry trashed |
| @ can have rzl==rx |
| @ can have rzh==ry |
| @ can have rzl,rzh==rt3,rt4 |
| .macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 |
| @ t0 t1 t2 t3 t4 |
| @ ¯(x) (y) |
| uxth \rt0,\rx @ xl |
| uxth \rt1,\ry @ yl |
| muls \rt0,\rt1 @ xlyl=L |
| asrs \rt2,\rx,#16 @ ¯xh |
| muls \rt1,\rt2 @ ¯xhyl=M0 |
| lsrs \rt4,\ry,#16 @ yh |
| muls \rt2,\rt4 @ ¯xhyh=H |
| uxth \rt3,\rx @ xl |
| muls \rt3,\rt4 @ xlyh=M1 |
| asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero) |
| adds \rt1,\rt3 @ M0+M1=M |
| movs \rt3,#0 @ 0 |
| adcs \rt4,\rt3 @ ¯Msx |
| lsls \rt4,#16 @ ¯Msx<<16 |
| adds \rt2,\rt4 @ H' |
| |
| @ t0 t1 t2 t3 t4 |
| @ (zl) (zh) |
| @ result = L + (M<<16) + (H'<<32), with the sign extension of M already folded into H' |
| lsls \rzl,\rt1,#16 @ M~ |
| lsrs \rzh,\rt1,#16 @ M~ |
| adds \rzl,\rt0 @ ZL |
| adcs \rzh,\rt2 @ ¯ZH |
| .endm |
| |
| @ SSMULL: x signed, y signed |
| @ rzl:rzh = rx*ry as a signed 64-bit value |
| @ in table below ¯ means signed variable |
| @ needs five temporary registers |
| @ can have rt3==rx, in which case rx trashed |
| @ can have rt4==ry, in which case ry trashed |
| @ can have rzl==rx |
| @ can have rzh==ry |
| @ can have rzl,rzh==rt3,rt4 |
| .macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 |
| @ t0 t1 t2 t3 t4 |
| @ ¯(x) ¯(y) |
| uxth \rt0,\rx @ xl |
| uxth \rt1,\ry @ yl |
| muls \rt0,\rt1 @ xlyl=L |
| asrs \rt2,\rx,#16 @ ¯xh |
| muls \rt1,\rt2 @ ¯xhyl=M0 |
| asrs \rt4,\ry,#16 @ ¯yh |
| muls \rt2,\rt4 @ ¯xhyh=H |
| uxth \rt3,\rx @ xl |
| muls \rt3,\rt4 @ ¯xlyh=M1 |
| adds \rt1,\rt3 @ ¯M0+M1=M |
| asrs \rt3,\rt1,#31 @ Msx |
| bvc l\@_1 @ V still holds the signed-overflow result of the adds above |
| mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow |
| l\@_1: |
| lsls \rt3,#16 @ ¯Msx<<16 |
| adds \rt2,\rt3 @ H' |
| |
| @ t0 t1 t2 t3 t4 |
| @ (zl) (zh) |
| @ result = L + (M<<16) + (H'<<32), with the sign extension of M already folded into H' |
| lsls \rzl,\rt1,#16 @ M~ |
| lsrs \rzh,\rt1,#16 @ M~ |
| adds \rzl,\rt0 @ ZL |
| adcs \rzh,\rt2 @ ¯ZH |
| .endm |
| |
| @ rzl:rzh = rx*rx, unsigned 32-bit square to 64 bits |
| @ uses xl*xl + 2*xl*xh*2^16 + xh*xh*2^32, so only three multiplies are needed |
| @ can have rt2==rx, in which case rx trashed |
| @ can have rzl==rx |
| @ can have rzh==rt1 |
| .macro square32_64 rx,rzl,rzh,rt0,rt1,rt2 |
| @ t0 t1 t2 zl zh |
| uxth \rt0,\rx @ xl |
| muls \rt0,\rt0 @ xlxl=L |
| uxth \rt1,\rx @ xl |
| lsrs \rt2,\rx,#16 @ xh |
| muls \rt1,\rt2 @ xlxh=M |
| muls \rt2,\rt2 @ xhxh=H |
| lsls \rzl,\rt1,#17 @ ML |
| lsrs \rzh,\rt1,#15 @ MH |
| adds \rzl,\rt0 @ ZL |
| adcs \rzh,\rt2 @ ZH |
| .endm |
| |
| double_section dmul_shim |
| @ dmul_shim: double multiplication, r0:r1 * r2:r3 -> r0:r1 (r1 and r3 are the high words). |
| @ Both operands are unpacked with mdunpack; the result sign is the XOR of the signs and the |
| @ result exponent the sum of the exponents. The 53x53-bit mantissa product is built from four |
| @ mul32_32_64 partial products, then normalised, rounded (ties to even) and packed. |
| @ Exponent overflow returns signed infinity; exponent underflow returns signed zero, except |
| @ for the single case handled at dm_0 where rounding up reaches the smallest normal. |
| @ r12 is used as scratch. |
| regular_func dmul_shim |
| push {r4-r7,r14} |
| mdunpack r0,r1,r4,r6,r5 |
| mov r12,r4 |
| mdunpack r2,r3,r4,r7,r5 |
| eors r7,r6 @ sign of result |
| add r4,r12 @ exponent of result |
| push {r0-r2,r4,r7} @ keep XL, XH, YL, exponent and sign; popped in stages below |
| |
| @ accumulate full product in r12:r5:r6:r7 |
| mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL |
| mov r12,r0 @ save LL bits |
| |
| mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH |
| |
| pop {r0} @ XL |
| mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH |
| adds r5,r0 |
| adcs r6,r3 |
| movs r0,#0 |
| adcs r7,r0 |
| |
| pop {r1,r2} @ XH,YL |
| mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL |
| adds r5,r1 |
| adcs r6,r2 |
| movs r0,#0 |
| adcs r7,r0 |
| |
| @ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12 |
| pop {r3,r4} @ exponent in r3, sign in r4 |
| lsls r1,r7,#11 |
| lsrs r2,r6,#21 |
| orrs r1,r2 |
| lsls r0,r6,#11 |
| lsrs r2,r5,#21 |
| orrs r0,r2 |
| lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12 |
| lsrs r2,r1,#20 |
| bne 1f @ skip if in range [2..4) |
| adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32) |
| adcs r0,r0 |
| adcs r1,r1 |
| subs r3,#1 @ correct exponent |
| 1: |
| ldr r6,=0x3ff |
| subs r3,r6 @ correct for exponent bias |
| lsls r6,#1 @ 0x7fe |
| cmp r3,r6 |
| bhs dm_0 @ exponent over- or underflow |
| lsls r5,#1 @ rounding bit to carry |
| bcc 1f @ result is correctly rounded |
| adds r0,#1 |
| movs r6,#0 |
| adcs r1,r6 @ round up |
| mov r6,r12 @ remaining sticky bits |
| orrs r5,r6 |
| bne 1f @ some sticky bits set? |
| lsrs r0,#1 |
| lsls r0,#1 @ round to even |
| 1: |
| lsls r3,#20 |
| adds r1,r3 |
| dm_2: |
| lsls r4,#31 |
| add r1,r4 |
| pop {r4-r7,r15} |
| |
| @ here for exponent over- or underflow |
| @ (flags are still those of the cmp r3,r6 above) |
| dm_0: |
| bge dm_1 @ overflow? |
| adds r3,#1 @ would-be zero exponent? |
| bne 1f |
| adds r0,#1 |
| bne 1f @ all-ones mantissa? |
| adds r1,#1 |
| lsrs r7,r1,#21 |
| beq 1f |
| lsrs r1,#1 @ mantissa carried out: result is the smallest normal |
| b dm_2 |
| 1: |
| lsls r1,r4,#31 @ signed zero |
| movs r0,#0 |
| pop {r4-r7,r15} |
| |
| @ here for exponent overflow |
| dm_1: |
| adds r6,#1 @ 0x7ff |
| lsls r1,r6,#20 |
| movs r0,#0 |
| b dm_2 |
| |
| .ltorg |
| |
| @ Approach to division y/x is as follows. |
| @ |
| @ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top |
| @ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so). |
| @ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction |
| @ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact |
| @ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to |
| @ round up or down. |
| @ |
| @ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively |
| @ that | u1*x-1 | < 10*2^-32. |
| @ |
| @ More precisely: |
| @ |
| @ x0=(q16)x; |
| @ x1=(q30)x; |
| @ y0=(q31)y; |
| @ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation |
| @ v=(q30)(u0*x1-1); |
| @ u1=(q30)u0-(q30~)(u0*v); |
| @ |
| @ a0=(q30)(u1*y0); |
| @ r0=(q82)y-a0*x; |
| @ r0x=(q57)r0; |
| @ d0=r0x*u1; |
| @ a1=d0+a0; |
| @ |
| @ Error analysis |
| @ |
| @ Use Greek letters to represent the errors introduced by rounding and truncation. |
| @ |
| @ r₀ = y - a₀x |
| @ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30 |
| @ = y ( 1 - u₁x ) + ( u₁α + β ) x |
| @ |
| @ Hence |
| @ |
| @ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30 |
| @ = 26*2^-32 |
| @ |
| @ r₁ = y - a₁x |
| @ = y - a₀x - d₀x |
| @ = r₀ - d₀x |
| @ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57 |
| @ = r₀ ( 1 - u₁x ) + u₁γx |
| @ |
| @ Hence |
| @ |
| @ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57 |
| @ = (260+128)*2^-64 |
| @ < 2^-55 |
| @ |
| @ Empirically it seems to be nearly twice as good as this. |
| @ |
| @ To determine correctly whether the exact remainder calculation can be skipped we need a result |
| @ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation |
| @ and so 1ulp is 2^-53 and so the calculation above suffices. |
| |
| double_section ddiv_shim |
| @ ddiv_shim: double division, r0:r1 / r2:r3 -> r0:r1 (r1 and r3 are the high words). |
| @ In the notation of the comment block above, y is the dividend r0:r1 and x the divisor r2:r3. |
| @ With use_hw_div set, the first reciprocal estimate u0 comes from the divider registers at |
| @ IOPORT: the divide is started right after unpacking x and its quotient is read only after |
| @ y has been unpacked. The .else branch calls __aeabi_uidiv instead. |
| @ The unpacked divisor mantissa is kept on the stack (push {r2,r3}) until dd_2. |
| @ Exponent overflow returns signed infinity, underflow returns signed zero. |
| @ r12 carries the result sign (bit 0) and exponent (from bit 2 upwards). |
| regular_func ddiv_shim |
| push {r4-r7,r14} |
| ddiv0: @ entry point from dtan |
| mdunpack r2,r3,r4,r7,r6 @ unpack divisor |
| |
| .if use_hw_div |
| |
| movs r5,#IOPORT>>24 |
| lsls r5,#24 |
| movs r6,#0 |
| mvns r6,r6 |
| str r6,[r5,#DIV_UDIVIDEND] |
| lsrs r6,r3,#4 @ x0=(q16)x |
| str r6,[r5,#DIV_UDIVISOR] |
| @ if there are not enough cycles from now to the read of the quotient for |
| @ the divider to do its stuff we need a busy-wait here |
| |
| .endif |
| |
| @ unpack dividend by hand to save on register use |
| lsrs r6,r1,#31 |
| adds r6,r7 |
| mov r12,r6 @ result sign in r12b0; r12b1 trashed |
| lsls r1,#1 |
| lsrs r7,r1,#21 @ exponent |
| beq 1f @ zero exponent? |
| adds r6,r7,#1 |
| lsrs r6,#11 |
| beq 2f @ exponent != 0x7ff? then done |
| 1: |
| movs r0,#0 |
| movs r1,#0 |
| subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct |
| lsls r7,#12 |
| 2: |
| subs r6,r7,r4 |
| lsls r6,#2 |
| add r12,r12,r6 @ (signed) exponent in r12[31..8] |
| subs r7,#1 @ implied 1 |
| lsls r7,#21 |
| subs r1,r7 |
| lsrs r1,#1 |
| |
| .if use_hw_div |
| |
| ldr r6,[r5,#DIV_QUOTIENT] |
| adds r6,#1 |
| lsrs r6,#1 |
| |
| .else |
| |
| @ this is not beautiful; could be replaced by better code that uses knowledge of divisor range |
| push {r0-r3} |
| movs r0,#0 |
| mvns r0,r0 |
| lsrs r1,r3,#4 @ x0=(q16)x |
| bl __aeabi_uidiv @ !!! this could (but apparently does not) trash R12 |
| adds r6,r0,#1 |
| lsrs r6,#1 |
| pop {r0-r3} |
| |
| .endif |
| |
| @ here |
| @ r0:r1 y mantissa |
| @ r2:r3 x mantissa |
| @ r6 u0, first approximation to 1/x Q15 |
| @ r12: result sign, exponent |
| |
| lsls r4,r3,#10 |
| lsrs r5,r2,#22 |
| orrs r5,r4 @ x1=(q30)x |
| muls r5,r6 @ u0*x1 Q45 |
| asrs r5,#15 @ v=u0*x1-1 Q30 |
| muls r5,r6 @ u0*v Q45 |
| asrs r5,#14 |
| adds r5,#1 |
| asrs r5,#1 @ round u0*v to Q30 |
| lsls r6,#15 |
| subs r6,r5 @ u1 Q30 |
| |
| @ here |
| @ r0:r1 y mantissa |
| @ r2:r3 x mantissa |
| @ r6 u1, second approximation to 1/x Q30 |
| @ r12: result sign, exponent |
| |
| push {r2,r3} @ x mantissa stays at [r13,#0] (xL) and [r13,#4] (xH) until dd_2 |
| lsls r4,r1,#11 |
| lsrs r5,r0,#21 |
| orrs r4,r5 @ y0=(q31)y |
| mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61 |
| adds r4,r4 |
| adcs r5,r5 @ a0=(q30)(y0*u1) |
| |
| @ here |
| @ r0:r1 y mantissa |
| @ r5 a0, first approximation to y/x Q30 |
| @ r6 u1, second approximation to 1/x Q30 |
| @ r12 result sign, exponent |
| |
| ldr r2,[r13,#0] @ xL |
| mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0 |
| ldr r4,[r13,#4] @ xH |
| muls r4,r5 @ xH*a0 |
| adds r3,r4 @ r2:r3 now x*a0 Q82 |
| lsrs r2,#25 |
| lsls r1,r3,#7 |
| orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89 |
| lsls r4,r0,#5 @ y Q57 |
| subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed) |
| |
| @ here |
| @ r0 r0x Q57 |
| @ r5 a0, first approximation to y/x Q30 |
| @ r4 yL Q57 |
| @ r6 u1 Q30 |
| @ r12 result sign, exponent |
| |
| muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87 |
| asrs r3,r6,#25 |
| adds r5,r3 |
| lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point) |
| @ here we could recover another 7 bits of precision (but not accuracy) from the top of r7 |
| @ but these bits are thrown away in the rounding and conversion to Q52 below |
| |
| @ here |
| @ r3:r5 a1 Q62 candidate quotient [0.5,2) or so |
| @ r4 yL Q57 |
| @ r12 result sign, exponent |
| |
| movs r6,#0 |
| adds r3,#128 @ for initial rounding to Q53 |
| adcs r5,r5,r6 |
| lsrs r1,r5,#30 |
| bne dd_0 |
| @ here candidate quotient a1 is in range [0.5,1) |
| @ so 30 significant bits in r5 |
| |
| lsls r4,#1 @ y now Q58 |
| lsrs r1,r5,#9 @ to Q52 |
| lsls r0,r5,#23 |
| lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result |
| orrs r0,r3 |
| bcs dd_1 |
| b dd_2 |
| dd_0: |
| @ here candidate quotient a1 is in range [1,2) |
| @ so 31 significant bits in r5 |
| |
| movs r2,#4 |
| add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61 |
| adds r3,#128 @ complete rounding to Q53 |
| adcs r5,r5,r6 |
| lsrs r1,r5,#10 |
| lsls r0,r5,#22 |
| lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result |
| orrs r0,r3 |
| bcc dd_2 |
| dd_1: |
| |
| @ here |
| @ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest |
| @ r4 yL Q58 or Q57 |
| @ r12 result sign, exponent |
| @ carry set |
| |
| adcs r0,r0,r0 |
| adcs r1,r1,r1 @ z Q53 with 1 in LSB |
| lsls r4,#16 @ Q105-32=Q73 |
| ldr r2,[r13,#0] @ xL Q52 |
| ldr r3,[r13,#4] @ xH Q20 |
| |
| movs r5,r1 @ zH Q21 |
| muls r5,r2 @ zH*xL Q73 |
| subs r4,r5 |
| muls r3,r0 @ zL*xH Q73 |
| subs r4,r3 |
| mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL |
| negs r2,r2 @ borrow from low half? |
| sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73) |
| |
| cmp r4,#0 |
| |
| bmi 1f @ negative remainder: z is already too big, do not round up |
| movs r2,#0 @ round up |
| adds r0,#1 |
| adcs r1,r2 |
| 1: |
| lsrs r0,#1 @ shift back down to Q52 |
| lsls r2,r1,#31 |
| orrs r0,r2 |
| lsrs r1,#1 |
| dd_2: |
| add r13,#8 @ discard the saved x mantissa |
| mov r2,r12 |
| lsls r7,r2,#31 @ result sign |
| asrs r2,#2 @ result exponent |
| ldr r3,=0x3fd |
| adds r2,r3 |
| ldr r3,=0x7fe |
| cmp r2,r3 |
| bhs dd_3 @ over- or underflow? |
| lsls r2,#20 |
| adds r1,r2 @ pack exponent |
| dd_5: |
| adds r1,r7 @ pack sign |
| pop {r4-r7,r15} |
| |
| dd_3: |
| movs r0,#0 |
| cmp r2,#0 |
| bgt dd_4 @ overflow? |
| movs r1,r7 @ underflow: signed zero |
| pop {r4-r7,r15} |
| |
| dd_4: |
| adds r3,#1 @ 0x7ff |
| lsls r1,r3,#20 @ overflow: infinity, sign added at dd_5 |
| b dd_5 |
| |
| .section SECTION_NAME(dsqrt_shim) |
| /* |
| Approach to square root x=sqrt(y) is as follows. |
| |
| First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y |
| to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because |
| d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is |
| a better approximation to sqrt(y). If this is near a rounding boundary we |
| compute an exact remainder y-a5*a5 to decide whether to round up or down. |
| |
| The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively |
| that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32. |
| |
| More precisely, with "y" representing y truncated to 30 binary places: |
| |
| u=(q3)y; // 24-entry table |
| a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table |
| p0=(q16)(a0*a0) * (q16)y; |
| r0=(q20)(p0-1); |
| dy0=(q15)(r0*a0); // Newton-Raphson correction term |
| a1=(q16)a0-dy0/2; // good to ~9 bits |
| |
| p1=(q19)(a1*a1)*(q19)y; |
| r1=(q23)(p1-1); |
| dy1=(q15~)(r1*a1); // second Newton-Raphson correction |
| a2x=(q16)a1-dy1/2; // good to ~16 bits |
| a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits |
| |
| p2=(a2*a2)*(q30)y; // Q62 |
| r2=(q36)(p2-1+1t-31); |
| dy2=(q30)(r2*a2); // Q52->Q30 |
| a3=(q31)a2-dy2/2; // good to about 30 bits |
| a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits |
| |
| Error analysis |
| |
| r₄ = y - a₄² |
| d₄ = 1/2 a₃r₄ |
| a₅ = a₄ + d₄ |
| r₅ = y - a₅² |
| = y - ( a₄ + d₄ )² |
| = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄² |
| = r₄ - a₃a₄r₄ - 1/4 a₃²r₄² |
| |
| | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄² |
| |
| a₅ = √y √( 1 - r₅/y ) |
| = √y ( 1 - 1/2 r₅/y + ... ) |
| |
| So to first order (second order being very tiny) |
| |
| √y - a₅ = 1/2 r₅/y |
| |
| and |
| |
| | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y ) |
| |
| From dsqrttest.c (conservatively): |
| |
| < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 ) |
| = 1/2 ( 200 + 200 ) * 2^-64 |
| < 2^-56 |
| |
| Empirically we see about 1ulp worst-case error including rounding at Q57. |
| |
| To determine correctly whether the exact remainder calculation can be skipped we need a result |
| accurate to < 0.25ulp at Q52, or 2^-54. |
| */ |
| |
| @ Special-case exits of dsqrt_shim, placed ahead of its entry point. |
| @ All of them return through dq_4/dq_1 with bx r14; nothing has been pushed at this point. |
| @ dq_2 is reached from the bhs after cmp r2,r3 in dsqrt_shim (r2 = exponent-1, r3 = 0x7fe) |
| @ and relies on the flags of that compare. |
| dq_2: |
| bge dq_3 @ +Inf? |
| movs r1,#0 @ otherwise the exponent field was zero: return +0 |
| b dq_4 |
| |
| @ dq_0: negative argument; on entry r2 = r1<<1 |
| dq_0: |
| lsrs r1,#31 |
| lsls r1,#31 @ preserve sign bit |
| lsrs r2,#21 @ extract exponent |
| beq dq_4 @ -0? return it |
| asrs r1,#11 @ make -Inf |
| b dq_4 |
| |
| dq_3: |
| ldr r1,=0x7ff |
| lsls r1,#20 @ return +Inf |
| dq_4: |
| movs r0,#0 |
| dq_1: |
| bx r14 |
| |
| @ dsqrt_shim: double square root, sqrt(r0:r1) -> r0:r1 (r1 is the high word). |
| @ Special cases branch to the dq_ labels above before anything is pushed: negative arguments |
| @ go to dq_0 (giving -0 or -Inf), a zero exponent field gives +0 and exponent 0x7ff gives +Inf. |
| @ Otherwise the mantissa is scaled to [1,4) so the exponent is even, a reciprocal square root |
| @ is refined from the drsqrtapp table by the iterations described in the comment block above, |
| @ and dq_5 computes an exact remainder when the estimate lies near a rounding boundary. |
| @ r12 holds the result exponent, already positioned for the high word. |
| .align 2 |
| regular_func dsqrt_shim |
| lsls r2,r1,#1 |
| bcs dq_0 @ negative? |
| lsrs r2,#21 @ extract exponent |
| subs r2,#1 |
| ldr r3,=0x7fe |
| cmp r2,r3 |
| bhs dq_2 @ catches 0 and +Inf |
| push {r4-r7,r14} |
| lsls r4,r2,#20 |
| subs r1,r4 @ insert implied 1 |
| lsrs r2,#1 |
| bcc 1f @ even exponent? skip |
| adds r0,r0,r0 @ odd exponent: shift up mantissa |
| adcs r1,r1,r1 |
| 1: |
| lsrs r3,#2 @ r3 = 0x7fe>>2 = 0x1ff |
| adds r2,r3 |
| lsls r2,#20 |
| mov r12,r2 @ save result exponent |
| |
| @ here |
| @ r0:r1 y mantissa Q52 [1,4) |
| @ r12 result exponent |
| .equ drsqrtapp_minus_8, (drsqrtapp-8) |
| adr r4,drsqrtapp_minus_8 @ first eight table entries are never accessed because of the mantissa's leading 1 |
| lsrs r2,r1,#17 @ y Q3 |
| ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8 |
| lsrs r3,r1,#4 @ first Newton-Raphson iteration |
| muls r3,r2 |
| muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32 |
| asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20 |
| muls r3,r2 |
| asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15 |
| lsls r2,#8 |
| subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16 |
| |
| movs r3,r2 |
| muls r3,r3 |
| lsrs r3,#13 |
| lsrs r4,r1,#1 |
| muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38 |
| asrs r3,#15 @ i32 r1=p1>>15; // Q23 |
| muls r3,r2 |
| asrs r3,#23 |
| adds r3,#1 |
| asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15 |
| subs r2,r3 @ i32 a2=a1-dy1; // Q16 |
| lsrs r3,r2,#16 |
| subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2 |
| |
| @ here |
| @ r0:r1 y mantissa |
| @ r2 a2 ~ 1/sqrt(y) Q16 |
| @ r12 result exponent |
| |
| movs r3,r2 |
| muls r3,r3 |
| lsls r1,#10 |
| lsrs r4,r0,#22 |
| orrs r1,r4 @ y Q30 |
| mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3 |
| lsls r5,r3,#6 |
| lsrs r4,#26 |
| orrs r4,r5 |
| adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4 |
| uxth r5,r4 |
| muls r5,r2 |
| asrs r4,#16 |
| muls r4,r2 |
| lsrs r5,#16 |
| adds r4,r5 |
| asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30 |
| lsls r2,#15 |
| subs r2,r4 |
| |
| @ here |
| @ r0 y low bits |
| @ r1 y Q30 |
| @ r2 a3 ~ 1/sqrt(y) Q31 |
| @ r12 result exponent |
| |
| mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4 |
| adds r3,r3,r3 |
| adcs r4,r4,r4 |
| adds r3,r3,r3 @ only the carry (the rounding bit) is wanted from this add |
| movs r3,#0 |
| adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30 |
| |
| @ here |
| @ r0 y low bits |
| @ r1 y Q30 |
| @ r2 a3 Q31 ~ 1/sqrt(y) |
| @ r3 a4 Q30 ~ sqrt(y) |
| @ r12 result exponent |
| |
| square32_64 r3, r4,r5, r6,r5,r7 |
| lsls r6,r0,#8 |
| lsrs r7,r1,#2 |
| subs r6,r4 |
| sbcs r7,r5 @ r4=(q60)y-a4*a4 |
| |
| @ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60 |
| |
| lsls r5,r7,#29 |
| lsrs r6,#3 |
| adcs r6,r5 @ r4 Q57 with rounding |
| muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89 |
| @ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30 |
| |
| adds r2,#8 |
| asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry |
| |
| @ here |
| @ r0 y low bits |
| @ r1 y Q30 |
| @ r2 d4 Q52, rounded to Q53 |
| @ C flag contains d4_b53 |
| @ r3 a4 Q30 |
| |
| bcs dq_5 @ spare bit set: near a rounding boundary, take the exact path |
| |
| lsrs r5,r3,#10 @ a4 Q52 |
| lsls r4,r3,#22 |
| |
| asrs r1,r2,#31 |
| adds r0,r2,r4 |
| adcs r1,r5 @ a4+d4 |
| |
| add r1,r12 @ pack exponent |
| pop {r4-r7,r15} |
| |
| .ltorg |
| |
| |
| @ round(sqrt(2^22./[68:8:252])) |
| @ 24 one-byte entries, indexed through drsqrtapp_minus_8 by the top bits of the mantissa |
| drsqrtapp: |
| .byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8 |
| .byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95 |
| .byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81 |
| |
| dq_5: |
| @ here we are near a rounding boundary, C is set |
| adcs r2,r2,r2 @ d4 Q53+1ulp |
| lsrs r5,r3,#9 |
| lsls r4,r3,#23 @ r4:r5 a4 Q53 |
| asrs r1,r2,#31 |
| adds r4,r2,r4 |
| adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp |
| movs r3,r5 |
| muls r3,r4 |
| square32_64 r4,r1,r2,r6,r2,r7 |
| adds r2,r3 |
| adds r2,r3 @ r1:r2 a5^2 Q106 |
| lsls r0,#22 @ y Q84 |
| |
| negs r1,r1 |
| sbcs r0,r2 @ remainder y-a5^2 |
| bmi 1f @ y<a5^2: no need to increment a5 |
| movs r3,#0 |
| adds r4,#1 |
| adcs r5,r3 @ bump a5 if over rounding boundary |
| 1: |
| lsrs r0,r4,#1 @ drop the extra bit: back to Q52 |
| lsrs r1,r5,#1 |
| lsls r5,#31 |
| orrs r0,r5 |
| add r1,r12 @ pack exponent |
| pop {r4-r7,r15} |
| |
| @ "scientific" functions start here |
| |
| @ double-length CORDIC rotation step |
| |
| @ r0:r1 ω |
| @ r6 32-i (complementary shift) |
| @ r7 i (shift) |
| @ r8:r9 x |
| @ r10:r11 y |
| @ r12 coefficient pointer |
| |
| @ an option in rotation mode would be to compute the sequence of σ values |
| @ in one pass, rotate the initial vector by the residual ω and then run a |
| @ second pass to compute the final x and y. This would relieve pressure |
| @ on registers and hence possibly be faster. The same trick does not work |
| @ in vectoring mode (but perhaps one could work to single precision in |
| @ a first pass and then double precision in a second pass?). |
| |
| double_section dcordic_vec_step |
| @ dcordic_vec_step: one CORDIC step in vectoring mode, using the register layout listed above. |
| @ Loads the next coefficient pair into r3:r4 from the pointer in r12 and advances r12 by |
| @ 8 bytes, then picks the direction from the sign of the high word of y (r11) and continues |
| @ in the shared step bodies inside dcordic_rot_step: label 1 there when y<0, label 2 otherwise. |
| regular_func dcordic_vec_step |
| mov r2,r12 |
| ldmia r2!,{r3,r4} |
| mov r12,r2 |
| mov r2,r11 |
| cmp r2,#0 |
| blt 1f |
| b 2f |
| |
| double_section dcordic_rot_step |
| @ dcordic_rot_step: one CORDIC step in rotation mode, using the register layout listed above. |
| @ Loads the next coefficient pair into r3:r4 from the pointer in r12 and advances r12 by |
| @ 8 bytes, then picks the direction from the sign of the high word of ω (r1). |
| @ Updates r0:r1 (ω), r8:r9 (x) and r10:r11 (y); uses r2-r5 as scratch; r6 and r7 must hold |
| @ 32-i and i. Returns with bx r14. |
| @ Labels 1 and 2 are also the targets of the branches at the end of dcordic_vec_step. |
| regular_func dcordic_rot_step |
| mov r2,r12 |
| ldmia r2!,{r3,r4} |
| mov r12,r2 |
| cmp r1,#0 |
| bge 1f |
| 2: |
| @ ω<0 / y>=0 |
| @ ω+=dω |
| @ x+=y>>i, y-=x>>i |
| adds r0,r3 |
| adcs r1,r4 |
| |
| mov r3,r11 |
| asrs r3,r7 |
| mov r4,r11 |
| lsls r4,r6 |
| mov r2,r10 |
| lsrs r2,r7 |
| orrs r2,r4 @ r2:r3 y>>i, rounding in carry |
| mov r4,r8 |
| mov r5,r9 @ r4:r5 x |
| adcs r2,r4 |
| adcs r3,r5 @ r2:r3 x+(y>>i) |
| mov r8,r2 |
| mov r9,r3 |
| |
| @ r4:r5 still hold the old x, so the y update below uses the value from before this step |
| mov r3,r5 |
| lsls r3,r6 |
| asrs r5,r7 |
| lsrs r4,r7 |
| orrs r4,r3 @ r4:r5 x>>i, rounding in carry |
| mov r2,r10 |
| mov r3,r11 |
| sbcs r2,r4 |
| sbcs r3,r5 @ r2:r3 y-(x>>i) |
| mov r10,r2 |
| mov r11,r3 |
| bx r14 |
| |
| |
| @ ω>0 / y<0 |
| @ ω-=dω |
| @ x-=y>>i, y+=x>>i |
| 1: |
| subs r0,r3 |
| sbcs r1,r4 |
| |
| mov r3,r9 |
| asrs r3,r7 |
| mov r4,r9 |
| lsls r4,r6 |
| mov r2,r8 |
| lsrs r2,r7 |
| orrs r2,r4 @ r2:r3 x>>i, rounding in carry |
| mov r4,r10 |
| mov r5,r11 @ r4:r5 y |
| adcs r2,r4 |
| adcs r3,r5 @ r2:r3 y+(x>>i) |
| mov r10,r2 |
| mov r11,r3 |
| |
| @ r4:r5 still hold the old y, so the x update below uses the value from before this step |
| mov r3,r5 |
| lsls r3,r6 |
| asrs r5,r7 |
| lsrs r4,r7 |
| orrs r4,r3 @ r4:r5 y>>i, rounding in carry |
| mov r2,r8 |
| mov r3,r9 |
| sbcs r2,r4 |
| sbcs r3,r5 @ r2:r3 x-(y>>i) |
| mov r8,r2 |
| mov r9,r3 |
| bx r14 |
| |
| @ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf |
| @ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works |
| |
| double_section double2int_shim |
| @ double2int_shim: double in r0:r1 -> signed 32-bit integer in r0. |
| @ Same as double2fix_shim with no places after the point. |
| regular_func double2int_shim |
| movs r2,#0 @ zero fractional places, then continue into double2fix_shim |
| @ double2fix_shim: double in r0:r1 -> signed 32-bit fixed point in r0, r2 places after the point. |
| @ Implemented by asking the 64-bit conversion for 32 extra places and keeping its high word. |
| regular_func double2fix_shim |
| adds r2,#32 @ 32 more places after the point |
| push {lr} |
| bl double2fix64_shim |
| movs r0,r1 @ high word of the 64-bit result |
| pop {pc} |
| |
| double_section double2uint_shim |
| @ double2uint_shim: double in r0:r1 -> unsigned 32-bit integer in r0. |
| @ Same as double2ufix_shim with no places after the point. |
| regular_func double2uint_shim |
| movs r2,#0 @ zero fractional places, then continue into double2ufix_shim |
| @ double2ufix_shim: double in r0:r1 -> unsigned 32-bit fixed point in r0, r2 places after the point. |
| @ Implemented by asking the 64-bit conversion for 32 extra places and keeping its high word. |
| regular_func double2ufix_shim |
| adds r2,#32 @ 32 more places after the point |
| push {lr} |
| bl double2ufix64_shim |
| movs r0,r1 @ high word of the 64-bit result |
| pop {pc} |
| |
| double_section double2int64_shim |
| @ double2int64_shim: double in r0:r1 -> signed 64-bit integer in r0:r1. |
| @ Same as double2fix64_shim with no places after the point. |
| regular_func double2int64_shim |
| movs r2,#0 @ zero fractional places, then continue into double2fix64_shim |
| @ double2fix64_shim: double in r0:r1 -> signed 64-bit fixed point in r0:r1, r2 places after the point. |
| @ d2fix leaves the sign extension bits of the true result in r3 (0 or -1); if the sign of |
| @ the 64-bit value disagrees with them the result is replaced by the largest or smallest |
| @ representable value. |
| regular_func double2fix64_shim |
| push {lr} |
| bl d2fix |
| asrs r2,r1,#31 @ sign of the 64-bit result, spread over the word |
| cmp r2,r3 |
| beq 1f @ sign extension bits match sign of result: value is in range |
| mvns r0,r3 |
| movs r1,#0x80 |
| lsls r1,r1,#24 @ r1 = 0x80000000 |
| eors r1,r1,r0 @ generate extreme fixed-point values |
| 1: |
| pop {pc} |
| |
| double_section double2uint64_shim |
| @ double2uint64_shim: double in r0:r1 -> unsigned 64-bit integer in r0:r1. |
| @ Same as double2ufix64_shim with no places after the point. |
| regular_func double2uint64_shim |
| movs r2,#0 @ and fall through |
| @ double2ufix64_shim: double in r0:r1 -> unsigned 64-bit fixed point in r0:r1, r2 places after the point. |
| @ Arguments with the sign bit set return 0; everything else falls into d2fix, whose exit |
| @ (in d2fix_a) returns directly to this routine's caller. |
| regular_func double2ufix64_shim |
| asrs r3,r1,#20 @ negative? return 0 |
| bmi ret_dzero |
| @ and fall through |
| |
| @ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf |
| @ result clamped so that r3 can only be 0 or -1 |
| @ trashes r12 |
| @ dunpacks is not defined in the visible part of this file; from its use here and the register |
| @ list at d2fix_a it presumably returns the signed mantissa in r0:r1 and the exponent in r2 - confirm. |
| @ r4 and r14 pushed here are popped by the exit paths of d2fix_a. |
| .thumb_func |
| d2fix: |
| push {r4,r14} |
| mov r12,r2 @ keep the requested number of places across the call |
| bl dunpacks |
| asrs r4,r2,#16 |
| adds r4,#1 |
| bge 1f |
| movs r1,#0 @ -0 -> +0 |
| 1: |
| asrs r3,r1,#31 @ r3 = sign extension bits of the mantissa |
| ldr r4, =d2fix_a |
| bx r4 |
| |
| @ ret_dzero: return 0 in r0:r1 |
| ret_dzero: |
| movs r0,#0 |
| movs r1,#0 |
| bx r14 |
| |
| @ d2fix_a: second half of d2fix; shifts the mantissa into its fixed-point position. |
| @ Entered with r4 and the return address already pushed by d2fix; every exit pops them. |
| .weak d2fix_a // weak because it exists in float code too |
| .thumb_func |
| d2fix_a: |
| @ here |
| @ r0:r1 two's complement mantissa |
| @ r2 unbiased exponent |
| @ r3 mantissa sign extension bits |
| @ r12 number of places required after the binary point (saved by d2fix) |
| add r2,r12 @ exponent plus offset for required binary point position |
| subs r2,#52 @ required shift |
| bmi 1f @ shift down? |
| @ here a shift up by r2 places |
| cmp r2,#12 @ will clamp? |
| bge 2f |
| movs r4,r0 |
| lsls r1,r2 |
| lsls r0,r2 |
| negs r2,r2 |
| adds r2,#32 @ complementary shift |
| lsrs r4,r2 @ bits of the low word that move up into the high word |
| orrs r1,r4 |
| pop {r4,r15} |
| 2: |
| mvns r0,r3 |
| mvns r1,r3 @ overflow: clamp to extreme fixed-point values |
| pop {r4,r15} |
| 1: |
| @ here a shift down by -r2 places |
| adds r2,#32 |
| bmi 1f @ long shift? |
| mov r4,r1 |
| lsls r4,r2 @ bits of the high word that move down into the low word |
| negs r2,r2 |
| adds r2,#32 @ complementary shift |
| asrs r1,r2 |
| lsrs r0,r2 |
| orrs r0,r4 |
| pop {r4,r15} |
| 1: |
| @ here a long shift down |
| movs r0,r1 |
| asrs r1,#31 @ shift down 32 places |
| adds r2,#32 |
| bmi 1f @ very long shift? |
| negs r2,r2 |
| adds r2,#32 |
| asrs r0,r2 |
| pop {r4,r15} |
| 1: |
| movs r0,r3 @ result very near zero: use sign extension bits |
| movs r1,r3 |
| pop {r4,r15} |
| |
| double_section double2float_shim
| @ double2float_shim: convert packed double to packed single
| @ in:  r0:r1 packed double
| @ out: r0    packed float
| @ uses r1-r3 as scratch, touches no stack and returns with bx r14
| @ - in-range values are rounded to nearest, ties to even
| @ - a double exponent field of 0x47f or more (this includes Inf and NaN)
| @   gives a signed infinity
| @ - a double exponent field of 0x380 or less gives a signed zero, except that
| @   field 0x380 with the top 23 mantissa bits all set gives the smallest
| @   normal float; no denormal floats are produced
| regular_func double2float_shim
| lsls r2,r1,#1
| lsrs r2,#21 @ exponent
| ldr r3,=0x3ff-0x7f
| subs r2,r3 @ fix exponent bias
| ble 1f @ underflow or zero
| cmp r2,#0xff
| bge 2f @ overflow or infinity
| lsls r2,#23 @ position exponent of result
| lsrs r3,r1,#31
| lsls r3,#31
| orrs r2,r3 @ insert sign
| lsls r3,r0,#3 @ rounding bits
| lsrs r0,#29 @ top 3 bits of the low word become the bottom 3 mantissa bits
| lsls r1,#12
| lsrs r1,#9 @ 20 mantissa bits of the high word, moved to bits 3..22
| orrs r0,r1 @ assemble mantissa
| orrs r0,r2 @ insert exponent and sign
| lsls r3,#1 @ carry=first discarded bit, Z=all remaining discarded bits are zero
| bcc 3f @ no rounding
| beq 4f @ all sticky bits 0?
| 5:
| adds r0,#1 @ round up; a carry out of the mantissa increments the exponent field
| 3:
| bx r14
| 4:
| lsrs r3,r0,#1 @ odd? then round up
| bcs 5b
| bx r14
| 1:
| @ flags are still those of the bias subtraction: Z set means the exponent
| @ field is exactly one below that of the smallest normal float
| beq 6f @ check case where value is just less than smallest normal
| 7:
| lsrs r0,r1,#31
| lsls r0,#31 @ return zero with the sign of the input
| bx r14
| 6:
| lsls r2,r1,#12 @ 20 1:s at top of mantissa?
| asrs r2,#12
| adds r2,#1 @ all ones sign-extends to -1, so this gives 0
| bne 7b
| lsrs r2,r0,#29 @ and 3 more 1:s?
| cmp r2,#7
| bne 7b
| movs r2,#1 @ return smallest normal with correct sign
| b 8f
| 2:
| movs r2,#0xff
| 8:
| @ pack sign, the exponent field in r2 (0xff or 1) and a zero mantissa
| lsrs r0,r1,#31 @ return signed infinity
| lsls r0,#8
| adds r0,r2
| lsls r0,#23
| bx r14
| |
| double_section x2double_shims
| @ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
| 
| @ uint2double_shim / ufix2double_shim
| @ in:  r0 unsigned 32-bit value
| @      r1 places after the binary point (ufix2double_shim entry only;
| @         the uint2double_shim entry forces it to 0)
| @ out: r0:r1 packed double, produced by tail-branching to ufix642double_shim
| .align 2
| regular_func uint2double_shim
| movs r1,#0 @ and fall through
| regular_func ufix2double_shim
| movs r2,r1 @ the 64-bit routine takes the number of places in r2
| movs r1,#0 @ zero-extend the value to 64 bits
| b ufix642double_shim
| |
| @ int2double_shim / fix2double_shim
| @ in:  r0 signed 32-bit value
| @      r1 places after the binary point (fix2double_shim entry only;
| @         the int2double_shim entry forces it to 0)
| @ out: r0:r1 packed double, produced by tail-branching to fix642double_shim
| .align 2
| regular_func int2double_shim
| movs r1,#0 @ and fall through
| regular_func fix2double_shim
| movs r2,r1 @ the 64-bit routine takes the number of places in r2
| asrs r1,r0,#31 @ sign extend
| b fix642double_shim
| |
| @ uint642double_shim / ufix642double_shim
| @ in:  r0:r1 unsigned 64-bit value
| @      r2    places after the binary point (ufix642double_shim entry only;
| @            the uint642double_shim entry forces it to 0)
| @ out: r0:r1 packed double, produced by the shared packing code at uf2d
| .align 2
| regular_func uint642double_shim
| movs r2,#0 @ and fall through
| regular_func ufix642double_shim
| movs r3,#0 @ sign word for uf2d: 0 means the result is positive
| b uf2d
| |
| @ int642double_shim / fix642double_shim
| @ in:  r0:r1 signed 64-bit value
| @      r2    places after the binary point (fix642double_shim entry only;
| @            the int642double_shim entry forces it to 0)
| @ out: r0:r1 packed double, rounded to nearest with ties to even
| @ r2 and r3 are overwritten; r4 and r5 are saved and restored
| @ uf2d is the shared packing code: it is entered with the magnitude in r0:r1,
| @ the number of places in r2 and the sign across all bits of r3
| .align 2
| regular_func int642double_shim
| movs r2,#0 @ and fall through
| regular_func fix642double_shim
| asrs r3,r1,#31 @ sign bit across all bits
| eors r0,r3 @ take the absolute value: invert if negative ...
| eors r1,r3
| subs r0,r3 @ ... and add 1 by subtracting the -1 in r3
| sbcs r1,r3
| uf2d:
| push {r4,r5,r14}
| @ 0x432 is one less than 0x3ff+52: the leading 1 of the normalised mantissa
| @ stays in bit 20 of r1 and bumps the exponent field by one when packed
| ldr r4,=0x432
| subs r2,r4,r2 @ form biased exponent
| @ here
| @ r0:r1 unnormalised mantissa
| @ r2 -Q (will become exponent)
| @ r3 sign across all bits
| cmp r1,#0
| bne 1f @ short normalising shift?
| movs r1,r0
| beq 2f @ zero? return it
| movs r0,#0
| subs r2,#32 @ fix exponent
| 1:
| asrs r4,r1,#21 @ Z clear=bits above bit 20 set, N=bit 31, carry=bit 20
| bne 3f @ will need shift down (and rounding?)
| bcs 4f @ normalised already?
| 5:
| subs r2,#1
| adds r0,r0 @ shift up
| adcs r1,r1
| lsrs r4,r1,#21 @ carry=bit 20 of r1
| bcc 5b
| 4:
| ldr r4,=0x7fe
| cmp r2,r4 @ unsigned compare, so a negative exponent also takes the branch
| bhs 6f @ over/underflow? return signed zero/infinity
| 7:
| lsls r2,#20 @ pack and return
| adds r1,r2
| lsls r3,#31
| adds r1,r3
| 2:
| pop {r4,r5,r15}
| 6: @ return signed zero/infinity according to unclamped exponent in r2
| mvns r2,r2
| lsrs r2,#21 @ 0 if r2 was negative (underflow), 0x7ff if it was too large
| movs r0,#0
| movs r1,#0
| b 7b
| 
| 3:
| @ here we need to shift down to normalise and possibly round
| @ the flags are still those of the asrs above: N set means bit 31 of r1 is set
| bmi 1f @ already normalised to Q63?
| 2:
| subs r2,#1
| adds r0,r0 @ shift up
| adcs r1,r1
| bpl 2b
| 1:
| @ here we have a 1 in b63 of r0:r1
| adds r2,#11 @ correct exponent for subsequent shift down
| lsls r4,r0,#21 @ save bits for rounding
| lsrs r0,#11
| lsls r5,r1,#21
| orrs r0,r5
| lsrs r1,#11
| lsls r4,#1 @ carry=first discarded bit, Z=all remaining discarded bits are zero
| beq 1f @ sticky bits are zero?
| 8:
| movs r4,#0 @ leaves the carry unchanged
| adcs r0,r4 @ add the rounding carry into the mantissa
| adcs r1,r4
| b 4b
| 1:
| bcc 4b @ sticky bits are zero but not on rounding boundary
| lsrs r4,r0,#1 @ increment if odd (force round to even)
| b 8b
| 
| 
| .ltorg
| |
| double_section dunpacks
| @ dunpacks: unpack the packed double in r0:r1
| @ out: r0:r1 mantissa (callers in this file treat it as a signed Q52 value)
| @      r2    exponent with the 0x3ff bias removed
| @ r3 is overwritten by the bias constant; r3 and r4 are also handed to the
| @ mdunpacks macro
| @ NOTE(review): mdunpacks is defined outside this chunk; r3 and r4 look like
| @ its scratch registers, so callers should treat both as trashed - confirm
| regular_func dunpacks
| mdunpacks r0,r1,r2,r3,r4
| ldr r3,=0x3ff
| subs r2,r3 @ exponent without offset
| bx r14
| |
| @ dreduce: range reduction, splitting x into q*d plus a small remainder
| @ r0:r1 signed mantissa Q52
| @ r2 unbiased exponent < 10 (i.e., |x|<2^10)
| @ r4 pointer to:
| @ - divisor reciprocal approximation r=1/d Q15
| @ - divisor d Q62 0..20
| @ - divisor d Q62 21..41
| @ - divisor d Q62 42..62
| @ returns:
| @ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
| @ r2 quotient q (number of reductions)
| @ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
| @ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
| @ overwrites r3-r7 on the main path (r3 and r4 only for small arguments),
| @ uses no stack and returns with bx r14
| double_section dreduce
| regular_func dreduce
| adds r2,#2 @ e+2
| bmi 1f @ |x|<0.25, too small to need adjustment
| cmp r2,#12
| bge 4f
| 2:
| movs r5,#17
| subs r5,r2 @ 15-e
| movs r3,r1 @ Q20
| asrs r3,r5 @ x Q5
| adds r2,#8 @ e+10
| adds r5,#7 @ 22-e = 32-(e+10)
| movs r6,r0
| lsrs r6,r5 @ bits of the low word that move up into the high word
| lsls r0,r2
| lsls r1,r2
| orrs r1,r6 @ r0:r1 x Q62
| ldmia r4,{r4-r7} @ r4=r Q15, r5:r6:r7=d in 21-bit pieces, low piece first
| muls r3,r4 @ rx Q20
| asrs r2,r3,#20 @ carry=most significant discarded bit
| movs r3,#0
| adcs r2,r3 @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
| muls r5,r2 @ qd in pieces: L Q62
| muls r6,r2 @ M Q41
| muls r7,r2 @ H Q20
| lsls r7,#10 @ H moved up to the scaling of the high word
| asrs r4,r6,#11 @ part of M that belongs in the high word
| lsls r6,#21 @ part of M that belongs in the low word
| adds r6,r5
| adcs r7,r4
| asrs r5,#31 @ sign extension of L
| adds r7,r5 @ r6:r7 qd Q62
| subs r0,r6
| sbcs r1,r7 @ remainder Q62
| bx r14
| 4:
| movs r2,#12 @ overflow: clamp to +/-1024
| movs r0,#0
| asrs r1,#31
| lsls r1,#1
| adds r1,#1 @ +1 or -1 according to the mantissa sign
| lsls r1,#20 @ mantissa of magnitude 1.0 in Q52
| b 2b
| 
| 1:
| @ small argument: q=0 and the result is just x converted to Q62
| lsls r1,#8
| lsrs r3,r0,#24
| orrs r1,r3
| lsls r0,#8 @ r0:r1 Q60, to be shifted down -r2 places
| negs r3,r2
| adds r2,#32 @ shift down in r3, complementary shift in r2
| bmi 1f @ long shift?
| 2:
| movs r4,r1
| asrs r1,r3
| lsls r4,r2
| lsrs r0,r3 @ carry=last bit shifted out of the low word
| orrs r0,r4
| movs r2,#0 @ rounding
| adcs r0,r2
| adcs r1,r2
| bx r14
| 
| 1:
| movs r0,r1 @ down 32 places
| asrs r1,#31
| subs r3,#32
| adds r2,#32
| bpl 2b
| movs r0,#0 @ very long shift? return 0
| movs r1,#0
| movs r2,#0
| bx r14
| |
| double_section dtan_shim
| @ dtan_shim: tangent of the packed double in r0:r1, result in r0:r1
| @ computes cos and sin as doubles from one shared CORDIC pass and then
| @ divides them
| @ NOTE(review): push_r8_r11, pop_r8_r11 and ddiv0 are defined outside this
| @ chunk. ddiv0 is entered by a plain branch with {r4-r7,r14} still pushed, so
| @ it presumably ends with the matching pop {r4-r7,r15} - confirm
| regular_func dtan_shim
| push {r4-r7,r14}
| bl push_r8_r11
| bl dsincos_internal
| mov r12,r0 @ save ε
| bl dcos_finish
| push {r0,r1} @ park cos θ while sin θ is computed
| mov r0,r12
| bl dsin_finish
| pop {r2,r3} @ r0:r1=sin θ, r2:r3=cos θ
| bl pop_r8_r11
| b ddiv0 @ compute sin θ/cos θ
| |
| double_section dcos_shim
| @ dcos_shim: cosine of the packed double in r0:r1, result in r0:r1
| @ shares its epilogue with dsin_shim: the final branch goes to local label 1
| @ in the code that follows, which calls pop_r8_r11 and pops {r4-r7,r15}
| regular_func dcos_shim
| push {r4-r7,r14}
| bl push_r8_r11
| bl dsincos_internal @ r0=ε, r8:r9=cos α, r10:r11=sin α
| bl dcos_finish @ r0:r1=cos θ as a packed double
| b 1f
| |
| double_section dsin_shim
| @ dsin_shim: sine of the packed double in r0:r1, result in r0:r1
| @ local label 1 below is also the epilogue that dcos_shim branches to
| regular_func dsin_shim
| push {r4-r7,r14}
| bl push_r8_r11
| bl dsincos_internal @ r0=ε, r8:r9=cos α, r10:r11=sin α
| bl dsin_finish @ r0:r1=sin θ as a packed double
| 1:
| bl pop_r8_r11
| pop {r4-r7,r15}
| |
| double_section dsincos_shim
| 
| @ dsincos_shim: sine and cosine of the packed double in r0:r1 from a single
| @ CORDIC pass
| @ out: r0:r1 sin θ, r2:r3 cos θ, both as packed doubles
| @ Note that this function returns in r0-r3
| regular_func dsincos_shim
| 
| push {r4-r7,r14}
| bl push_r8_r11
| bl dsincos_internal
| mov r12,r0 @ save ε
| bl dcos_finish
| push {r0,r1} @ park cos θ while sin θ is computed
| mov r0,r12
| bl dsin_finish
| pop {r2,r3} @ cos θ comes back in the second register pair
| bl pop_r8_r11
| pop {r4-r7,r15}
| |
| double_section dtrig_guts
| 
| @ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
| @ θ=α+ε and |ε|≤2^-32
| @ on return:
| @ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
| @ r8:r9 cos α Q62
| @ r10:r11 sin α Q62
| @ r2-r7 and r12 are overwritten and r8-r11 carry results, so callers save
| @ r4-r11 around this routine
| @ NOTE(review): dcordic_rot_step is defined outside this chunk; from the
| @ setup below it presumably takes the table pointer in r12, the shift count
| @ in r7 and the complementary shift in r6 - confirm
| .align 2
| .thumb_func
| dsincos_internal:
| push {r14}
| bl dunpacks
| adr r4,dreddata0
| bl dreduce @ r0:r1=θ reduced modulo π/2 Q62, r2=quadrant count q
| 
| movs r4,#0
| ldr r5,=0x9df04dbb @ this value compensates for the non-unity scaling of the CORDIC rotations
| ldr r6,=0x36f656c5
| lsls r2,#31 @ carry=bit 1 of q
| bcc 1f
| @ quadrant 2 or 3
| mvns r6,r6 @ negate the 64-bit value r5:r6
| negs r5,r5
| adcs r6,r4
| 1:
| lsls r2,#1 @ carry=bit 0 of q
| bcs 1f
| @ even quadrant
| mov r10,r4 @ start vector: sin=0, cos=r5:r6
| mov r11,r4
| mov r8,r5
| mov r9,r6
| b 2f
| 1:
| @ odd quadrant
| mov r8,r4 @ start vector: cos=0, sin=r5:r6
| mov r9,r4
| mov r10,r5
| mov r11,r6
| 2:
| adr r4,dtab_cc
| mov r12,r4
| movs r7,#1 @ shift count runs from 1 to 32
| movs r6,#31 @ complementary shift, 32 minus the shift count
| 1:
| bl dcordic_rot_step
| adds r7,#1
| subs r6,#1
| cmp r7,#33
| bne 1b
| pop {r15}
| |
| @ dcos_finish: combine the CORDIC outputs into cos θ and pack it as a double
| @ out: r0:r1 cos θ, returned through the tail branch to fix642double_shim
| @ reads r0 and r8-r11 and leaves r8-r12 untouched in the code visible here
| @ NOTE(review): muls32_s32_64 is a macro defined outside this chunk; the
| @ comments below take it to leave the product of r0 and r1 in r2:r3 with
| @ r4-r6 as scratch - confirm
| .thumb_func
| dcos_finish:
| @ here
| @ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
| @ r8:r9 cos α Q62
| @ r10:r11 sin α Q62
| @ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
| mov r1,r11 @ only the high word of sin α takes part in the product
| @ mov r2,r10
| @ lsrs r2,#31
| @ adds r1,r2 @ rounding improves accuracy very slightly
| muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
| @ r2:r3 ε sin α Q(62+62-32)=Q92
| mov r0,r8
| mov r1,r9
| @ shift r2:r3 down 30 places from Q92 to Q62
| lsls r5,r3,#2
| asrs r3,r3,#30
| lsrs r2,r2,#30 @ carry=last bit shifted out
| orrs r2,r5
| sbcs r0,r2 @ include rounding
| sbcs r1,r3
| movs r2,#62 @ the value in r0:r1 has 62 places after the point
| b fix642double_shim
| |
| @ dsin_finish: combine the CORDIC outputs into sin θ and pack it as a double
| @ out: r0:r1 sin θ, returned through the tail branch to fix642double_shim
| @ NOTE(review): muls32_s32_64 is a macro defined outside this chunk; the
| @ comments below take it to leave the product of r0 and r1 in r2:r3 with
| @ r4-r6 as scratch - confirm
| .thumb_func
| dsin_finish:
| @ here
| @ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
| @ r8:r9 cos α Q62
| @ r10:r11 sin α Q62
| @ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
| mov r1,r9 @ only the high word of cos α takes part in the product
| muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
| @ r2:r3 ε cos α Q(62+62-32)=Q92
| mov r0,r10
| mov r1,r11
| @ shift r2:r3 down 30 places from Q92 to Q62
| lsls r5,r3,#2
| asrs r3,r3,#30
| lsrs r2,r2,#30 @ carry=last bit shifted out
| orrs r2,r5
| adcs r0,r2 @ include rounding
| adcs r1,r3
| movs r2,#62 @ the value in r0:r1 has 62 places after the point
| b fix642double_shim
| 
| .ltorg
| @ dreddata0: reduction constants for the trig functions, in the four-word
| @ layout that dreduce loads with ldmia r4,{r4-r7}: the reciprocal first, then
| @ the divisor in three 21-bit pieces, least significant piece first
| @ word aligned because dsincos_internal takes its address with adr
| .align 2
| dreddata0:
| .word 0x0000517d @ 2/π Q15
| .word 0x0014611A @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
| .word 0x000A8885
| .word 0x001921FB
| |
| |
| @ datan2_shim: two-argument arctangent
| @ in:  r0:r1 y, r2:r3 x, both packed doubles
| @ out: r0:r1 angle as a packed double
| @ outline:
| @ - arguments whose exponent field is 0 or 0x7ff get their mantissa cleared
| @   (so denormals act as zero and NaNs act as infinities)
| @ - the point is rotated in steps of π/2 until |y| is at most about |x| with
| @   x positive, counting the steps in r6
| @ - y/x is computed, converted to Q62 and run through 32 CORDIC vectoring
| @   steps plus a short residual division
| @ - r6 times π/2 is added back and the Q61 result is packed
| @ NOTE(review): push_r8_r11, pop_r8_r11, ddiv_shim and dcordic_vec_step are
| @ defined outside this chunk; ddiv_shim is taken to return r0:r1 divided by
| @ r2:r3 and to preserve r6 is not required since r6 is pushed first - confirm
| .align 2
| regular_func datan2_shim
| @ r0:r1 y
| @ r2:r3 x
| push {r4-r7,r14}
| bl push_r8_r11
| ldr r5,=0x7ff00000 @ mask for the exponent field
| movs r4,r1
| ands r4,r5 @ y==0?
| beq 1f
| cmp r4,r5 @ or Inf/NaN?
| bne 2f
| 1:
| lsrs r1,#20 @ flush
| lsls r1,#20
| movs r0,#0
| 2:
| movs r4,r3
| ands r4,r5 @ x==0?
| beq 1f
| cmp r4,r5 @ or Inf/NaN?
| bne 2f
| 1:
| lsrs r3,#20 @ flush
| lsls r3,#20
| movs r2,#0
| 2:
| movs r6,#0 @ quadrant offset
| lsls r5,#11 @ constant 0x80000000
| cmp r3,#0
| bpl 1f @ skip if x positive
| movs r6,#2
| eors r3,r5 @ negate both x and y, a rotation by π
| eors r1,r5
| bmi 1f @ quadrant offset=+2 if y was positive
| negs r6,r6 @ quadrant offset=-2 if y was negative
| 1:
| @ now in quadrant 0 or 3
| adds r7,r1,r5 @ r7=-r1
| bpl 1f
| @ y>=0: in quadrant 0
| @ only the high words are compared, hence the approximate inequalities
| cmp r1,r3
| ble 2f @ y<~x so 0≤θ<~π/4: skip
| adds r6,#1
| eors r1,r5 @ negate x
| b 3f @ and exchange x and y = rotate by -π/2
| 1:
| cmp r3,r7
| bge 2f @ -y<~x so -π/4<~θ≤0: skip
| subs r6,#1
| eors r3,r5 @ negate y and ...
| 3:
| movs r7,r0 @ exchange x and y
| movs r0,r2
| movs r2,r7
| movs r7,r1
| movs r1,r3
| movs r3,r7
| 2:
| @ here -π/4<~θ<~π/4
| @ r6 has quadrant offset
| push {r6}
| cmp r2,#0
| bne 1f @ low word of x non-zero: x is an ordinary number, go and divide
| cmp r3,#0
| beq 10f @ x==0 going into division?
| lsls r4,r3,#1
| asrs r4,#21
| adds r4,#1 @ gives 0 only when the exponent field of x is all ones
| bne 1f @ x==Inf going into division?
| lsls r4,r1,#1
| asrs r4,#21
| adds r4,#1 @ y also ±Inf?
| bne 10f @ finite y over infinite x: the reduced angle is 0
| subs r1,#1 @ make them both just finite
| subs r3,#1
| b 1f
| 
| 10:
| movs r0,#0 @ reduced angle is 0; only the quadrant offset contributes
| movs r1,#0
| b 12f
| 
| 1:
| bl ddiv_shim
| movs r2,#62
| bl double2fix64_shim
| @ r0:r1 y/x
| mov r10,r0
| mov r11,r1
| movs r0,#0 @ ω=0
| movs r1,#0
| mov r8,r0
| movs r2,#1
| lsls r2,#30
| mov r9,r2 @ x=1
| 
| adr r4,dtab_cc
| mov r12,r4
| movs r7,#1 @ shift count runs from 1 to 32
| movs r6,#31 @ complementary shift, 32 minus the shift count
| 1:
| bl dcordic_vec_step
| adds r7,#1
| subs r6,#1
| cmp r7,#33
| bne 1b
| @ r0:r1 atan(y/x) Q62
| @ r8:r9 x residual Q62
| @ r10:r11 y residual Q62
| mov r2,r9 @ high word of the x residual
| mov r3,r10 @ low word of the y residual
| subs r2,#12 @ this makes atan(0)==0
| @ the following is basically a division residual y/x ~ atan(residual y/x)
| @ 30 passes, one per bit of r4: each pass halves r2, adds it to or subtracts
| @ it from r3 to drive r3 towards zero, and adjusts r0:r1 by r4 in step
| movs r4,#1
| lsls r4,#29
| movs r7,#0
| 2:
| lsrs r2,#1 @ carry=bit shifted out
| movs r3,r3 @ preserve carry
| bmi 1f
| sbcs r3,r2
| adds r0,r4
| adcs r1,r7
| lsrs r4,#1
| bne 2b
| b 3f
| 1:
| adcs r3,r2
| subs r0,r4
| sbcs r1,r7
| lsrs r4,#1
| bne 2b
| 3:
| @ arithmetic shift of r0:r1 down one place to make room for the quadrant offset
| lsls r6,r1,#31
| asrs r1,#1
| lsrs r0,#1
| orrs r0,r6 @ Q61
| 
| 12:
| pop {r6}
| 
| cmp r6,#0
| beq 1f
| ldr r4,=0x885A308D @ π/2 Q61
| ldr r5,=0x3243F6A8
| bpl 2f @ flags are still those of the cmp: the loads do not change them
| mvns r4,r4 @ negative quadrant offset
| mvns r5,r5
| 2:
| lsls r6,#31 @ non-zero only if bit 0 of the offset is set
| bne 2f @ skip if quadrant offset is ±1
| adds r0,r4 @ offset is ±2: add the constant twice
| adcs r1,r5
| 2:
| adds r0,r4
| adcs r1,r5
| 1:
| movs r2,#61
| bl fix642double_shim
| 
| bl pop_r8_r11
| pop {r4-r7,r15}
| 
| .ltorg
| |
| @ dtab_cc: CORDIC angle table, atan(2^-i) for i=1..32
| @ each entry is one 64-bit Q62 value stored as two words, low word first
| @ dsincos_internal and datan2_shim take its address with adr, put it in r12
| @ and then run 32 CORDIC steps, one per entry
| dtab_cc:
| .word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62
| .word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62
| .word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62
| .word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62
| .word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62
| .word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62
| .word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62
| .word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62
| .word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62
| .word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62
| .word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62
| .word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62
| .word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62
| .word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62
| .word 0xffff5555, 0x00007fff @ atan 2^-15 Q62
| .word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62
| .word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62
| .word 0xffffffab, 0x00000fff @ atan 2^-18 Q62
| .word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62
| .word 0xffffffff, 0x000003ff @ atan 2^-20 Q62
| .word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these
| .word 0x00000000, 0x00000100 @ atan 2^-22 Q62
| .word 0x00000000, 0x00000080 @ atan 2^-23 Q62
| .word 0x00000000, 0x00000040 @ atan 2^-24 Q62
| .word 0x00000000, 0x00000020 @ atan 2^-25 Q62
| .word 0x00000000, 0x00000010 @ atan 2^-26 Q62
| .word 0x00000000, 0x00000008 @ atan 2^-27 Q62
| .word 0x00000000, 0x00000004 @ atan 2^-28 Q62
| .word 0x00000000, 0x00000002 @ atan 2^-29 Q62
| .word 0x00000000, 0x00000001 @ atan 2^-30 Q62
| .word 0x80000000, 0x00000000 @ atan 2^-31 Q62
| .word 0x40000000, 0x00000000 @ atan 2^-32 Q62
| |
| double_section dexp_guts
| @ dexp_shim: exponential of the packed double in r0:r1, result in r0:r1
| @ outline:
| @ - dreduce splits the argument into q*ln2 plus a remainder, which is then
| @   moved into the non-negative range by adding ln2 and decrementing q
| @ - for i=1..32 the table value log(1+2^-i) is subtracted from the remainder
| @   whenever it still fits, and the running product x is multiplied by
| @   1+2^-i each time it does
| @ - the last residual ε is applied as x*(1+ε) and the result is packed with
| @   62-q places after the point, which scales it by 2^q
| @ NOTE(review): muls32_32_64 is a macro defined outside this chunk; the
| @ comments below take it to leave the product of r0 and r3 in r4:r1 with
| @ r5-r7 as scratch - confirm
| regular_func dexp_shim
| push {r4-r7,r14}
| bl dunpacks
| adr r4,dreddata1
| bl dreduce @ r0:r1=remainder Q62, r2=q
| cmp r1,#0
| bge 1f
| ldr r4,=0xF473DE6B
| ldr r5,=0x2C5C85FD @ ln2 Q62
| adds r0,r4 @ negative remainder: add ln2 ...
| adcs r1,r5
| subs r2,#1 @ ... and take one off q to compensate
| 1:
| push {r2} @ q is needed again when packing
| movs r7,#1 @ shift
| adr r6,dtab_exp
| movs r2,#0
| movs r3,#1
| lsls r3,#30 @ x=1 Q62
| 
| 3:
| ldmia r6!,{r4,r5} @ r4:r5=log(1+2^-i) Q62
| mov r12,r6 @ keep the table pointer while r6 is used as scratch
| subs r0,r4
| sbcs r1,r5 @ trial subtraction
| bmi 1f
| 
| negs r6,r7
| adds r6,#32 @ complementary shift
| movs r5,r3
| asrs r5,r7
| movs r4,r3
| lsls r4,r6
| movs r6,r2
| lsrs r6,r7 @ rounding bit in carry
| orrs r4,r6
| adcs r2,r4
| adcs r3,r5 @ x+=x>>i
| b 2f
| 
| 1:
| adds r0,r4 @ restore argument
| adcs r1,r5
| 2:
| mov r6,r12
| adds r7,#1
| cmp r7,#33
| bne 3b
| 
| @ here
| @ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
| @ r2:r3 exp a Q62
| @ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
| muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
| @ r4:r1 ε exp a Q(62+62-32)=Q92
| @ shift r4:r1 down 30 places from Q92 to Q62 into r0:r1, then add exp a
| lsrs r4,#30
| lsls r0,r1,#2
| orrs r0,r4
| asrs r1,#30
| adds r0,r2
| adcs r1,r3
| 
| pop {r2} @ q
| negs r2,r2
| adds r2,#62 @ 62-q places after the point
| bl fix642double_shim @ in principle we can pack faster than this because we know the exponent
| pop {r4-r7,r15}
| 
| .ltorg
| |
| @ dln_shim: natural logarithm of the packed double in r0:r1, result in r0:r1
| @ special cases, decided from the high word alone:
| @ - sign bit set, or exponent field 0 (zero and denormals): returns -Inf
| @ - exponent field all ones (Inf and NaN): returns +Inf
| @ otherwise the mantissa m/2 is multiplied by factors 1+2^-i for i=1..32
| @ whenever the product stays below 1, subtracting log(1+2^-i) from y each
| @ time, and the result is y plus (exponent+1)*ln2 plus the final residual
| .align 2
| regular_func dln_shim
| push {r4-r7,r14}
| lsls r7,r1,#1
| bcs 5f @ <0 ...
| asrs r7,#21
| beq 5f @ ... or =0? return -Inf
| adds r7,#1
| beq 6f @ Inf/NaN? return +Inf
| bl dunpacks
| push {r2} @ unbiased exponent, used after the loop
| lsls r1,#9
| lsrs r2,r0,#23
| orrs r1,r2
| lsls r0,#9
| @ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
| 
| movs r7,#1 @ shift
| adr r6,dtab_exp
| mov r12,r6 @ r12 walks the table, 8 bytes per pass
| movs r2,#0
| movs r3,#0 @ y=0 Q62
| 
| 3:
| negs r6,r7
| adds r6,#32 @ complementary shift
| movs r5,r1
| asrs r5,r7
| movs r4,r1
| lsls r4,r6
| movs r6,r0
| lsrs r6,r7
| orrs r4,r6 @ x>>i, rounding bit in carry
| adcs r4,r0
| adcs r5,r1 @ x+(x>>i)
| 
| lsrs r6,r5,#30 @ non-zero once bit 30 or 31 of the high word is set
| bne 1f @ x+(x>>i)>1?
| movs r0,r4
| movs r1,r5 @ x+=x>>i
| mov r6,r12
| ldmia r6!,{r4,r5} @ r4:r5=log(1+2^-i) Q62
| subs r2,r4
| sbcs r3,r5 @ y-=log(1+2^-i)
| 
| 1:
| movs r4,#8
| add r12,r4 @ step to the next table entry
| adds r7,#1
| cmp r7,#33
| bne 3b
| @ here:
| @ r0:r1 residual x, nearly 1 Q62
| @ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
| @ result is y + ln2 + ln x ~ y + ln2 + (x-1)
| lsls r1,#2
| asrs r1,#2 @ x-1
| adds r2,r0
| adcs r3,r1
| 
| pop {r7}
| @ here:
| @ r2:r3 ln m/2 = ln m - ln2 Q62
| @ r7 unbiased exponent
| .equ dreddata1_plus_4, (dreddata1+4)
| adr r4,dreddata1_plus_4
| ldmia r4,{r0,r1,r4} @ ln2 in three 21-bit pieces, least significant first
| adds r7,#1 @ exponent+1 supplies the ln2 owed from working with m/2
| muls r0,r7 @ Q62
| muls r1,r7 @ Q41
| muls r4,r7 @ Q20
| @ add the three partial products at their proper positions
| lsls r7,r1,#21
| asrs r1,#11
| asrs r5,r1,#31
| adds r0,r7
| adcs r1,r5
| lsls r7,r4,#10
| asrs r4,#22
| asrs r5,r1,#31
| adds r1,r7
| adcs r4,r5
| @ r0:r1:r4 exponent*ln2 Q62
| asrs r5,r3,#31 @ sign extension of y
| adds r0,r2
| adcs r1,r3
| adcs r4,r5
| @ r0:r1:r4 result Q62
| movs r2,#62
| 1:
| asrs r5,r1,#31
| cmp r4,r5
| beq 2f @ r4 a sign extension of r1?
| lsrs r0,#4 @ no: shift down 4 places and try again
| lsls r6,r1,#28
| orrs r0,r6
| lsrs r1,#4
| lsls r6,r4,#28
| orrs r1,r6
| asrs r4,#4
| subs r2,#4 @ four fewer places after the point
| b 1b
| 2:
| bl fix642double_shim
| pop {r4-r7,r15}
| 
| 5:
| ldr r1,=0xfff00000 @ high word of -Inf
| movs r0,#0
| pop {r4-r7,r15}
| 
| 6:
| ldr r1,=0x7ff00000 @ high word of +Inf
| movs r0,#0
| pop {r4-r7,r15}
| 
| .ltorg
| |
| @ dreddata1: reduction constants for exp and ln, in the four-word layout
| @ that dreduce loads with ldmia r4,{r4-r7}: the reciprocal first, then the
| @ divisor in three 21-bit pieces, least significant piece first
| @ dexp_shim passes the whole block to dreduce; dln_shim reads only the three
| @ ln2 pieces starting at dreddata1+4
| @ word aligned because both users take the address with adr
| .align 2
| dreddata1:
| .word 0x0000B8AA @ 1/ln2 Q15
| .word 0x0013DE6B @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces
| .word 0x000FEFA3
| .word 0x000B1721
| |
| @ dtab_exp: table of log(1+2^-i) for i=1..32
| @ each entry is one 64-bit Q62 value stored as two words, low word first
| @ dexp_shim and dln_shim both walk it one entry per pass of their 32-pass loops
| dtab_exp:
| .word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62
| .word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62
| .word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62
| .word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62
| .word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62
| .word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62
| .word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62
| .word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62
| .word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62
| .word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62
| .word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62
| .word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62
| .word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62
| .word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62
| .word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62
| .word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62
| .word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62
| .word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62
| .word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62
| .word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62
| .word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62
| .word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62
| .word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62
| .word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62
| .word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62
| .word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62
| .word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62
| .word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62
| .word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62
| .word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62
| .word 0x80000000, 0x00000000 @ log 1+2^-31 Q62
| .word 0x40000000, 0x00000000 @ log 1+2^-32 Q62
| |
| |
| #endif |