| /* |
| * Copyright (c) 2024 Raspberry Pi (Trading) Ltd. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| */ |
| |
| #include "pico/asm_helper.S" |
| |
| #if !HAS_DOUBLE_COPROCESSOR |
| #error attempt to compile double_fma_rp2350 when there is no DCP |
| #else |
| |
| #include "hardware/dcp_instr.inc.S" |
| #include "hardware/dcp_canned.inc.S" |
| |
| pico_default_asm_setup |
| |
// todo: factor out save/restore (there is a copy in the float code)
| |
| .macro double_section name |
| #if PICO_DOUBLE_IN_RAM |
| .section RAM_SECTION_NAME(\name), "ax" |
| #else |
| .section SECTION_NAME(\name), "ax" |
| #endif |
| .endm |
| |
| .macro double_wrapper_section func |
| double_section WRAPPER_FUNC_NAME(\func) |
| .endm |
| |
| // ============== STATE SAVE AND RESTORE =============== |
| |
| .macro saving_func_return |
| bx lr |
| .endm |
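
// Calling convention: a wrapped function that finds the DCP engaged (PCMP/bmi)
// branches back to a stub which does push {lr}; bl __dcp_save_state. The save
// routine stacks the three 64-bit DCP state registers and re-enters the stub
// via blx lr, leaving lr pointing at __dcp_restore_state; the wrapped function's
// final bx lr therefore returns through the restore code, which unstacks the
// state and pops the original return address.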
| |
| double_section __rp2350_dcp_engaged_state_save_restore_copy |
| .thumb_func |
| __dcp_save_state: |
sub sp, #24 // reserve space for the three 64-bit DCP state words
push {r0, r1} // scratch for the state reads
| // do save here |
| PXMD r0, r1 |
| strd r0, r1, [sp, #8 + 0] |
| PYMD r0, r1 |
| strd r0, r1, [sp, #8 + 8] |
| REFD r0, r1 |
| strd r0, r1, [sp, #8 + 16] |
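// stack is now: scratch r0,r1, then XMD, YMD, EFD; after the pop below, sp
// addresses the 24 bytes of saved state that __dcp_restore_state expects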
| pop {r0, r1} |
| blx lr |
| // <- wrapped function returns here |
| // fall through into restore: |
| .thumb_func |
| __dcp_restore_state: |
| // do restore here |
| pop {r12, r14} |
| WXMD r12, r14 |
| pop {r12, r14} |
| WYMD r12, r14 |
| pop {r12, r14} |
| WEFD r12, r14 |
| pop {pc} |
| |
| double_wrapper_section __dfma |
| @ cf saving_func macro: but here we need to record the SP before the state save possibly changes it |
| 1: |
| push {lr} // 16-bit instruction |
| bl __dcp_save_state // 32-bit instruction |
| b 1f // 16-bit instruction |
| |
| @ compute mn+a with full intermediate precision |
| @ r0:r1 m |
| @ r2:r3 n |
| @ [r13,#0] a |
| wrapper_func fma |
| mov r12,sp @ save the SP |
| PCMP apsr_nzcv @ test the engaged flag |
| bmi 1b |
| 1: |
| push {r4-r8,r14} |
| ldrd r4,r5,[r12,#0] @ fetch a using original SP |
| ubfx r7,r1,#20,#11 @ r7=em |
| ubfx r8,r3,#20,#11 @ r8=en |
| add r8,r7,r8 @ em+en |
| |
| eors r6,r1,r3 @ get sign of mn |
| eors r6,r6,r5 @ set N if mn has opposite sign to a, i.e. if the operation is essentially a subtraction |
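@ (the N flag computed here must survive until the bmi below that splits the
@ addition/subtraction paths: none of the intervening instructions set flags)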
| WXUP r4,r5 @ write a to coprocessor to get its classification |
| PEFD r14,r12 @ r14=fa |
| WXUP r0,r1 @ write m and n to coprocessor to get their classifications |
| WYUP r2,r3 |
| PEFD r6,r12 @ r6=fm, r12=fn, r14=fa |
| orr r14,r14,r6 |
| orr r14,r14,r12 @ OR of all the classification flags, so we can check if any are zero/Inf/NaN |
| |
| RXMS r3,r6,0 @ we will almost always need the full product so compute it here (cf dmul macro) |
| RYMS r7,r12,0 |
| umull r0,r1,r3,r7 |
| mov r2,#0 @ seems to be no 16-bit instruction which zeros a register without affecting the flags |
| umlal r1,r2,r3,r12 |
| umlal r1,r2,r6,r7 |
| mov r3,#0 |
| umlal r2,r3,r6,r12 @ r0:r1:r2:r3: full product mn Q124 1≤mn<4 |
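@ (no carry can be lost from the two middle umlal accumulations: the Q62
@ mantissa high words are below 2^31, so r1:r2 stays below 2^64 throughout)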
| |
| bmi 50f @ mn has opposite sign to a so operation is essentially a subtraction |
| |
| @ ======================== ADDITION PATH ======================== |
| |
| tst r14,#0x70000000 @ were any of the arguments zero/inf/NaN? |
| bne 90f @ then use mla path which gives the correct result in all these cases |
| ubfx r14,r5,#20,#11 @ r14=ea |
| @ here all operands are finite and non-zero |
| @ r0:r1:r2:r3: full product mn Q124 1≤mn<4 |
| @ r4:r5 a IEEE packed |
| @ r8: em+en [biased +0x3ff*2] |
| @ r14: ea [biased +0x3ff] |
subw r7,r8,#0x3fd @ em+en+2 [+0x3ff]
| subs r7,r7,r14 @ em+en-ea+2 (debiased) |
blt 80f @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥3 so e.g. if ea=0 (hence 1≤a<2) then em+en≤-3 and mn<4.2^-3=1/2
| |
| @ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO mn ======================== |
| |
| @ here |mn| is big compared to |a|; e.g. if em+en=0 (so 1≤mn<4) then ea≤2 and a<8 |
| movs r8,#1 |
| bfi r5,r8,#20,#12 @ insert implied 1 in a |
| rsbs r7,r7,#74 @ shift up ≤74 (can be negative) that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros) |
| @ now add (shifted) a into mn, preserving flags |
| and r8,r7,#0x1f @ k=shift mod 32 |
| mov r12,#1 |
| lsl r12,r12,r8 @ 2^k |
umull r5,r6,r5,r12 @ shift up high word: r4:r5:r6 is now a_lo + 2^32.2^k.a_hi
sub r12,#1 @ 2^k-1
umlal r4,r5,r4,r12 @ shift up low word via a_lo + (2^k-1).a_lo = 2^k.a_lo: r4:r5:r6 is now 2^k.(a_lo + 2^32.a_hi) = a shifted up by k
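@ (the umlal cannot lose a carry out of r4:r5: the low k bits of 2^k.a_hi are
@ zero, so 2^k.a_lo + 2^32.(2^k.a_hi mod 2^32) < 2^64)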
| bmi 91f @ use flags: will a be shifted down? |
| cmp r7,#64 @ shift up by two more words? |
| bge 92f |
| cmp r7,#32 @ shift up by one more word? |
| bge 93f |
| adds r0,r0,r4 @ no more word shifts |
| adcs r1,r1,r5 |
| adcs r2,r2,r6 |
| adcs r3,r3,#0 @ r0:r1:r2:r3: mn + a (cf dmul macro) |
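@ hand the 128-bit sum to the coprocessor as in the dmul macro tail (this
@ pattern recurs below): the low 64 bits contribute only stickiness, the high
@ 64 bits are normalised and rounded by NRDD, and RDDM reads back the packed
@ result and clears the engaged flag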
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD @ as dmul macro tail: exponent computed in coprocessor is correct |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 93: |
| adds r1,r1,r4 |
| adcs r2,r2,r5 |
| adcs r3,r3,r6 @ r0:r1:r2:r3: mn + (a<<32) |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 92: |
| adds r2,r2,r4 |
| adcs r3,r3,r5 @ r0:r1:r2:r3: mn + (a<<64); note this cannot overflow as total shift was at most 74 (see above) |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 91: @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done |
| @ r0:r1:r2:r3: mn |
| @ r4:r5:r6: a |
| @ r7: alignment shift required (negative) |
| cmn r7,#32 @ shift down one word? |
| bge 94f |
| cmn r7,#64 @ shift down two words? |
| bge 95f |
| @ here a is shifted entirely below the bottom of m |
| orr r0,r0,#1 @ a is non-zero so ensure we set the sticky bit |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 94: |
| adds r0,r0,r5 @ one word shift down |
| adcs r1,r1,r6 |
| adcs r2,r2,#0 |
| adcs r3,r3,#0 |
| orr r0,r0,r4 @ contribution from a to sticky bits |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 95: |
| adds r0,r0,r6 @ two word shift down |
| adcs r1,r1,#0 |
| adcs r2,r2,#0 |
| adcs r3,r3,#0 |
| orr r0,r0,r4 @ contribution from a to sticky bits |
| orr r0,r0,r5 |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| @ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO a ======================== |
| |
| 80: |
| @ here |mn|<~|a| |
| @ r0:r1:r2:r3: mn Q124 |
| @ r4:r5 a IEEE packed |
| @ r7: -(shift down required to align mn with a), guaranteed negative |
| @ r8: em+en [biased +0x3ff*2] |
| @ r14: ea [biased +0x3ff] |
| tst r3,#0x20000000 |
| bne 1f @ 2≤mn<4? |
| adds r2,r2,r2 @ normalise so mn is 2..4 Q124; note that the contents of r0 and r1 are always destined for the sticky bit in this path |
| adcs r3,r3,r3 |
| subs r7,r7,#1 @ correction to alignment shift |
| 1: |
| @ now we construct an IEEE packed value in r2:r3 such that adding it to r4:r5 gives the correct final result |
| @ observe that the exponent of this constructed value will be at least two less than that of a (by the "|a| is big compared to |mn|" test above) |
| @ so the alignment shift in the final addition will be by at least two places; thus we can use bit 0 of the constructed |
| @ value as a sticky bit, and we still have one bit in hand for rounding |
| subs r7,r7,#2 @ now r7 < -2 |
| orr r0,r0,r2,lsl#23 @ shift r2:r3 down 9 places, ORing excess into sticky bits |
| lsrs r2,r2,#9 |
| orr r2,r2,r3,lsl#23 |
| lsrs r3,r3,#9 |
| orrs r0,r0,r1 |
| it ne |
| orrne r2,r2,#1 @ sticky bit from bottom 64 bits of mn as shifted |
| @ r2:r3 mn 2..4 Q51, i.e. 1..2 Q52 |
| @ r2b0 holds sticky bit; note that for alignment with a in r4:r5, r2:r3 will be shifted down at least one place |
| |
| lsrs r6,r5,#31 @ get sign of a (which in this path is the same as the sign of mn, and of the result) |
| orr r3,r3,r6,lsl#31 @ set sign in mn |
| |
| adds r14,r7,r14 @ get exponent for mn relative to a; note this can go negative |
| add r3,r3,r14,lsl#20 @ note that "implied" 1 is present in r3, giving an offset of 1 in the exponent |
| bmi 1f @ negative? then we have just constructed a denormal (or less) and the addition will give an incorrect result |
| dcp_dadd_m r0,r1,r2,r3,r4,r5 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 1: |
| @ compare with similar code in subtraction path: here we cannot underflow |
| cmn r7,#64 @ if the alignment shift for mn is very large then the result is just a |
| ble 82f |
| add r3,r3,#0x40000000 @ ea cannot be very large (as adding r7 made it negative), so safe to add 1024 to exponents of both a and mn |
| add r5,r5,#0x40000000 |
| dcp_dadd_m r0,r1,r2,r3,r4,r5 |
| sub r1,r1,#0x40000000 @ fix exponent |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 90: |
| @ dcp_dmul_m tail then dadd ("mla path") |
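@ (i.e. compute round(mn), then round(round(mn)+a); with a zero/Inf/NaN
@ operand the extra rounding of the product cannot change the result)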
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| dcp_dadd_m r0,r1,r0,r1,r4,r5 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 82: @ |mn| is very small compared to |a|, so result is a |
| RDDM r0,r1 @ clear the engaged flag |
| movs r0,r4 |
| movs r1,r5 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| @ ======================== SUBTRACTION PATH ======================== |
| |
| 50: |
| tst r14,#0x70000000 @ were any of the arguments zero/inf/NaN? |
| bne 90b @ then use mla path which gives the correct result in all these cases |
| ubfx r14,r5,#20,#11 @ r14=ea |
| @ now all operands are finite and non-zero |
| @ r0:r1:r2:r3: full product mn Q124 1≤mn<4 |
| @ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path) |
| @ r8: em+en [+0x3ff*2] |
| @ r14: ea [+0x3ff] |
subw r8,r8,#0x3fc @ em+en+3 [+0x3ff]
| subs r7,r8,r14 @ em+en-ea+3 (debiased) |
| blt 80f @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4 |
| beq 94f @ branch if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2 |
| @ in this branch, if e.g. em+en=0 (so 1≤mn<4) then ea≤2 and a<8 |
rsbs r7,r7,#75 @ 75-(em+en-ea+3) = 72-(em+en-ea), shift up ≤74 (can be negative) that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros)
| mvn r14,r5,lsr#31 @ save complement of sign of a |
| @ subtract (shifted) a from mn |
| and r6,r7,#0x1f @ k=shift mod 32 |
| mov r12,#1 |
| bfi r5,r12,#20,#12 @ insert implied 1 in a |
| lsl r12,r12,r6 @ 2^k |
| umull r5,r6,r5,r12 |
| sub r12,#1 |
| umlal r4,r5,r4,r12 @ shift a up by shift amount mod 32 (see comment in addition path) |
| @ r4:r5:r6: a shifted up by k=shift mod 32 |
| bmi 91f @ will a be shifted down? |
| cmp r7,#64 @ shift up by two more words? |
| bge 92f |
| cmp r7,#32 @ shift up by one more word? |
| bge 93f |
| subs r0,r0,r4 @ no more word shifts; this cannot go negative or have bad cancellation |
| sbcs r1,r1,r5 |
| sbcs r2,r2,r6 |
| sbcs r3,r3,#0 @ r0:r1:r2:r3: mn - a (cf dmul macro) |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD @ as dmul macro tail: exponent and sign computed in coprocessor is correct |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 94: |
| @ here if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2 |
| @ r0:r1:r2:r3: full product mn Q124 1≤mn<4 |
| @ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path) |
| lsls r5,r5,#11 @ convert a to mantissa Q63 in r4:r5 |
| orrs r5,r5,r4,lsr#21 |
| lsls r4,r4,#11 |
orrs r5,r5,#0x80000000 @ implied 1
| movs r6,#0 |
| subs r0,r6,r0 @ compute |a|-|mn| |
| sbcs r6,r6,r1 |
| sbcs r4,r4,r2 |
| sbcs r5,r5,r3 |
| WXMS r0,r6 @ write sticky bits |
| WXMO r4,r5 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
eor r1,r1,#0x80000000 @ sign of result is opposite to that of product as yielded by coprocessor
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 93: |
| subs r1,r1,r4 @ shifting a up by one word: this cannot go negative or have bad cancellation |
| sbcs r2,r2,r5 |
| sbcs r3,r3,r6 |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 92: |
| subs r2,r2,r4 @ shifting a up by two words: this /can/ go negative or have bad cancellation |
| sbcs r3,r3,r5 |
| cmp r3,#0x01000000 @ check we have at least 57 bits of product so that dmul tail will round correctly (this test is slightly conservative - 55 needed?) |
| blt 1f @ also trap case where result is negative |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| @ heavy cancellation case |
| @ r0:r1:r2:r3: result Q124, signed |
| @ r8: em+en+3 |
| @ r14b0: save complement of sign of a |
| 1: |
| sub r8,r8,#1 @ em+en+2 |
| RDDM r6,r7 @ clear engaged flag |
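@ the dmul-style tail cannot round correctly here, so finish in software:
@ negate if necessary, normalise, round to nearest-even and repack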
blo 2f @ result non-negative? then skip the negation
movs r6,#0 @ result is negative: negate it...
| subs r0,r6,r0 |
| sbcs r1,r6,r1 |
| sbcs r2,r6,r2 |
| sbcs r3,r6,r3 |
| eor r14,r14,#1 @ ... and flip saved sign |
| 2: @ now normalise result |
| orrs r6,r2,r3 @ shift up by 64 possible? |
| bne 7f |
| movs r3,r1 @ do it |
| movs r2,r0 |
| movs r1,#0 |
| movs r0,#0 |
| sub r8,r8,#64 @ fix exponent |
| 7: |
| cmp r3,#0 @ shift up by 32 possible? |
| bne 8f |
| movs r3,r2 @ do it |
| movs r2,r1 |
| movs r1,r0 |
| movs r0,#0 |
| sub r8,r8,#32 |
| 8: |
| cmp r3,#0 @ is result zero? return it |
| beq 9f |
| clz r6,r3 @ k=amount of final shift |
| subs r8,r8,r6 @ final exponent |
| movs r7,#1 |
| lsls r7,r7,r6 @ r7=2^k |
| muls r3,r3,r7 |
| subs r7,r7,#1 @ 2^k-1 |
| umlal r2,r3,r2,r7 |
| umlal r1,r2,r1,r7 |
| umlal r0,r1,r0,r7 @ r0:r1:r2:r3: normalised result |
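@ (same multiply-by-2^k shift trick as in the alignment code above, cascaded
@ across four words; the k leading zeros of r3 ensure no carry is lost)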
| orrs r0,r0,r1 @ any sticky bits below top 64? |
| it ne |
| orrne r2,r2,#1 @ or into sticky bit |
| lsrs r0,r2,#11 @ align to mantissa position for IEEE format |
| lsrs r1,r3,#11 |
| orr r0,r0,r3,lsl#21 |
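@ r0:r1 now hold the mantissa Q52 (implied 1 in bit 20 of r1); the bits of r2
@ below bit 11 are the rounding and sticky bits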
| lsls r2,r2,#22 @ rounding bit in C, sticky bit in ~Z |
| bcc 10f @ no rounding? |
| beq 11f @ rounding tie? |
| adcs r0,r0,#0 @ round up (C is set) |
| adcs r1,r1,#0 |
| adds r8,r8,r1,lsr#20 @ candidate for exponent field |
| ble 12f @ underflow? overflow cannot occur here as the result is smaller in magnitude than a |
| bfi r1,r8,#20,#11 @ insert exponent |
| orr r1,r1,r14,lsl#31 @ or in sign |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 11: |
| adcs r0,r0,#0 @ round up as above |
| adcs r1,r1,#0 |
| bic r0,r0,#1 @ to even |
| adds r8,r8,r1,lsr#20 @ candidate for exponent field |
| ble 12f @ underflow? |
| bfi r1,r8,#20,#11 @ insert exponent |
| orr r1,r1,r14,lsl#31 @ or in sign |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 10: |
| adds r8,r8,r1,lsr#20 @ candidate for exponent field |
| ble 12f @ underflow? |
| bfi r1,r8,#20,#11 @ insert exponent |
| orr r1,r1,r14,lsl#31 @ or in sign |
| 9: |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 12: |
| mov r1,r14,lsl#31 @ underflow: return signed zero |
| movs r0,#0 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 91: @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done |
| @ r0:r1:r2:r3: mn |
| @ r4:r5:r6: a |
| @ r7: alignment shift required (negative) |
| cmn r7,#32 @ shift down one word? |
| bge 94f |
| cmn r7,#64 @ shift down two words? |
| bge 95f |
| @ here a is shifted entirely below the bottom of m |
| subs r0,r0,#1 @ subtract an epsilon (a is non-zero) |
| sbcs r1,r1,#0 |
| sbcs r2,r2,#0 |
| sbcs r3,r3,#0 |
| orr r0,r0,#1 @ ensure the sticky bit is set (a is non-zero) |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 94: |
| rsbs r4,r4,#0 @ one word shift down |
| sbcs r0,r0,r5 |
| sbcs r1,r1,r6 |
| sbcs r2,r2,#0 |
| sbcs r3,r3,#0 |
| orr r0,r0,r4 @ sticky bits |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 95: |
| movs r7,#0 @ two words shift down |
| subs r4,r7,r4 |
| sbcs r5,r7,r5 |
| sbcs r0,r0,r6 |
| sbcs r1,r1,r7 |
| sbcs r2,r2,r7 |
| sbcs r3,r3,r7 |
| orrs r0,r0,r4 @ sticky bits |
| orrs r0,r0,r5 |
| WXMS r0,r1 @ write sticky bits |
| WXMO r2,r3 @ write sticky+result bits |
| NRDD |
| RDDM r0,r1 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 80: |
| @ here |a| is big compared to |mn|, more precisely ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4 |
| @ r0:r1:r2:r3: mn Q124 |
| @ r4:r5: a IEEE packed |
| @ r7<0, em+en-ea+3 (debiased) |
| @ r14: ea [+0x3ff] |
| lsrs r6,r3,#29 |
| bne 1f @ 2≤mn<4? |
| adds r2,r2,r2 @ shift up one place |
| adcs r3,r3,r3 |
| subs r7,r7,#1 @ fix exponent |
| 1: @ now r2:r3 is mn Q61, sticky bits in r0:r1 |
| subs r7,r7,#3 |
@ r7 = emn-ea < -3
| orr r0,r0,r2,lsl#23 @ gather sticky bits |
| lsrs r2,r2,#9 @ adjust mn to Q52 ready to create packed IEEE version of mn |
| orr r2,r2,r3,lsl#23 |
| lsrs r3,r3,#9 |
| orrs r0,r0,r1 @ or of all sticky bits |
| it ne |
| orrne r2,r2,#1 @ sticky bit from bottom 64 bits of mn |
| |
| mvn r6,r5,lsr#31 @ complement of sign of a |
| orr r3,r3,r6,lsl#31 @ fix sign of mn so we do a subtraction |
| |
| adds r14,r7,r14 @ this can go negative; r14 is now at most ea[+0x3ff]-4 |
| add r3,r3,r14,lsl#20 |
| @ the exponent field in r2:r3 (mn) is now at most ea[+0x3ff]-3 |
| @ that means that in the dadd operation that follows, mn will be shifted down at least three places to align with a, |
| @ and a post-normalisation shift up of at most one place will be needed |
| @ therefore in the worst case r2b2 affects b0 of the result; r2b1 affects the rounding of the result; and r2b0 can be used as a sticky bit |
| bmi 1f @ did exponent go negative? |
| |
| dcp_dadd_m r0,r1,r2,r3,r4,r5 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| 1: |
| cmn r7,#64 @ is mn being shifted well below the bottom of a? |
| ble 82b @ then result is just a |
| add r3,r3,#0x40000000 @ otherwise offset exponents by +1024 |
| add r5,r5,#0x40000000 |
| dcp_dadd_m r0,r1,r2,r3,r4,r5 |
| ubfx r2,r1,#20,#11 @ get exponent |
| cmp r2,#0x400 @ too small? |
| itte ls |
andls r1,r1,#0x80000000 @ flush to signed zero
| movls r0,#0 |
| subhi r1,r1,#0x40000000 @ else fix exponent of result |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4-r8,lr} |
| saving_func_return |
| |
| |
| double_wrapper_section __dmla |
| @ cf saving_func macro: but here we need to record the SP before the state save possibly changes it |
| 1: |
| push {lr} // 16-bit instruction |
| bl __dcp_save_state // 32-bit instruction |
| b 1f // 16-bit instruction |
| |
| @ r0:r1 m |
| @ r2:r3 n |
| @ [r13,#0] a |
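@ unlike fma above, the product here is rounded before the addition
@ (a non-fused multiply-add)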
| regular_func mla |
| mov r12,sp @ save the SP |
| PCMP apsr_nzcv @ test the engaged flag |
| bmi 1b |
| 1: |
| push {r4,r5,r14} |
dcp_dmul_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r4,r5,r14 @ rounded double product mn in r0:r1
| ldrd r2,r3,[r12,#0] @ fetch a using original SP |
| dcp_dadd_m r0,r1,r0,r1,r2,r3 |
| // todo optimize this based on final decision on saving_func_entry |
| pop {r4,r5,r14} |
| saving_func_return |
| |
| #endif |