| /* MEMORY LAYOUT ASSUMPTIONS |
| |
| The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see |
| the macro getchaffaddress. |
| |
| The stack must be located at the end of Y scratch RAM: see the memory |
| wiping at the end of ctr_crypt_s where memory between the start of Y |
| scratch RAM and the stack pointer is overwritten. |
| */ |
| |
| .syntax unified |
| .cpu cortex-m33 |
| .thumb |
| |
| #include "config.h" |
| #include "hardware/platform_defs.h" |
| #include "hardware/regs/addressmap.h" |
| #include "hardware/regs/clocks.h" |
| #include "hardware/regs/sha256.h" |
| #include "hardware/regs/resets.h" |
| #include "hardware/regs/rosc.h" |
| #include "hardware/regs/trng.h" |
| #include "hardware/rcp.h" |
| |
| #if HARDENING |
| @ Number of calls to gen_rand_sha[_nonpres] |
| #define RND_COUNT_decrypt 394 // From decrypt up to call to ctr_crypt_s |
| #define RND_COUNT_ctr_crypt_s_init (17 + 32 * CT_BPERM) // Init phase of ctr_crypt_s |
| #define RND_COUNT_ctr_crypt_mainloop_A (4 + ST_VPERM + ST_SHAREC) |
| #define RND_COUNT_refreshchaff_and_lfsr 2 |
| #define RND_COUNT_remap 2 |
| #define RND_COUNT_decryption_end 3 |
| #endif |
| |
| .global decrypt |
| .global chaff |
| |
| .extern lock_key |
| |
| @ RCP macros |
| |
| #define CTAG0 0x2a |
| #define CTAG1 0x2b |
| #define CTAG2 0x2c |
| #define CTAG3 0x2d |
| #define CTAG4 0x2e |
| #define CTAG5 0x30 |
| #define CTAG6 0x31 |
| #define CTAG7 0x32 |
| #define CTAG8 0x33 |
| #define CTAG9 0x34 |
| #define CTAG10 0x35 |
| #define CTAG11 0x36 |
| #define CTAG12 0x37 |
| #define CTAG13 0x38 |
| #define CTAG14 0x39 |
| #define CTAG15 0x3a |
| #define CTAG16 0x3b |
| #define CTAG17 0x3c |
| #define CTAG18 0x3d |
| #define CTAG19 0x3e |
| #define CTAG20 0x3f |
| #define CTAG21 0x29 |
| |
| @ number of blocks from the TRNG processed to initialise rstate_sha |
| #define TRNG_BLOCKS 25 |
| |
| @ if GEN_RAND_SHA==0 then we don't call the counting version |
| #if HARDENING && GEN_RAND_SHA |
| .macro check_rnd_count count |
| .if !(\count & 0xffffff00) |
| movs r1, #\count |
| .else |
| ldr r1, =\count |
| .endif |
| movs r0, #(\count & 1) ^ 1 |
| bl check_rnd_count_func |
| rcp_iequal_nodelay r1, r0 |
| #if DOUBLE_HARDENING |
| rcp_iequal_nodelay r0, r1 |
| #endif |
| .endm |
| |
| @ r1 has the expected count |
| @ Trashes r0 |
| .macro check_rnd_count_dynamic |
| mov r0, sp |
| bl check_rnd_count_func |
| rcp_iequal_nodelay r1, r0 |
| #if DOUBLE_HARDENING |
| rcp_iequal_nodelay r0, r1 |
| #endif |
| .endm |
| |
| .macro reset_rnd_count |
| bl reset_rnd_count_func |
| .endm |
| |
| .macro reset_rnd_count_checked |
| @ This version verifies that the count was actually reset |
| uxtb r0, r1 |
| bl reset_rnd_count_func |
| ldr r0, [r0] |
| bics r1, #0xff00ff |
| rcp_iequal_nodelay r1, r0 |
| .endm |
| |
| #else |
| .macro check_rnd_count count |
| .endm |
| .macro reset_rnd_count |
| .endm |
| .macro reset_rnd_count_checked |
| .endm |
| #endif |
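@ Typical use (illustrative): a phase that should have made exactly N calls to
@ gen_rand_sha[_nonpres] since the last reset_rnd_count is followed by
@ "check_rnd_count N" (e.g. N=RND_COUNT_decrypt above); when HARDENING (and
@ GEN_RAND_SHA) are enabled, a mismatch between the expected and recorded
@ counts trips the RCP.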
@ The lower jitterpriority is, the more jitter is applied
| .macro SET_COUNT n,jitterpriority |
| .if RC_COUNT |
| .if RC_JITTER > \jitterpriority |
| rcp_count_set \n |
| .else |
| rcp_count_set_nodelay \n |
| .endif |
| .endif |
| .endm |
| |
| .macro CHK_COUNT n,jitterpriority |
| .if RC_COUNT |
| .if RC_JITTER > \jitterpriority |
| rcp_count_check \n |
| .else |
| rcp_count_check_nodelay \n |
| .endif |
| .endif |
| .endm |
| |
| .macro GET_CANARY rx,tag,jitterpriority |
| .if RC_CANARY |
| .if RC_JITTER > \jitterpriority |
| rcp_canary_get \rx,\tag |
| .else |
| rcp_canary_get_nodelay \rx,\tag |
| .endif |
| .endif |
| .endm |
| |
| .macro CHK_CANARY rx,tag,jitterpriority |
| .if RC_CANARY |
| .if RC_JITTER > \jitterpriority |
| rcp_canary_check \rx,\tag |
| .else |
| rcp_canary_check_nodelay \rx,\tag |
| .endif |
| .endif |
| .endm |
| |
| @ Clear internal stripe load registers, and r0-r3 |
| @ 0 <= offset <= 32 |
| .macro clear03 offset=0 |
| getchaffaddress r0,\offset |
| ldmia r0,{r0-r3} |
| .endm |
| |
| .macro clear03_preserve_r3 offset=0 |
| getchaffaddress r0,\offset |
| ldmia r0!,{r1-r2} |
| ldmia r0!,{r1-r2} |
| .endm |
| |
| .macro clear01 offset=0 |
| getchaffaddress r0,\offset |
| ldmia r0,{r0,r1} |
| .endm |
| |
| @ Put workspace in the second scratch area |
@ The "a" (allocatable) attribute (and possibly the %progbits attribute) is necessary for storing the murmur3 constants;
@ otherwise they may end up silently replaced with 0 or 0xffffffff
| .section .scratch_y.aes,"aw",%progbits |
| |
| workspace_start: |
| |
| @ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress |
| @ We need to set the chaff address directly with MOVs, rather than setting it with a load as normal, because at the point |
| @ the macro is called we have just done a load of a sensitive value at a known memory offset mod 16, and the idea is that |
| @ the next load is going to be of a random number (in the "chaff" memory) at that same offset mod 16, so we can't afford |
| @ to do a ldr \rx, =0x20081000 + \offset first, as this will load a non-random value from an uncontrolled memory location mod 16. |
@ Ideally we'd avoid the magic number 0x20081000 by using ADR \rx, chaff+\offset, but the linker does not support this.
| .macro getchaffaddress rx,offset=0 |
| mov \rx,#(0x1000+\offset) |
| movt \rx,#0x2008 |
| .endm |
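@ For illustration only: "getchaffaddress r0,8" expands to
@     mov  r0,#0x1008
@     movt r0,#0x2008
@ leaving r0 = 0x20081008 = chaff+8, with no intermediate literal-pool load.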
| chaff: |
| .space 48 |
| |
| .balign 16 |
| rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words |
| @ see comment at init_key_4way for description of layout and meaning of rkey_s |
| .space 600 |
| rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space |
| .space 128 |
| .if CT_BPERM |
| bperm_rand: @ 32 half words that define the oblivious permutation of blocks |
| .space 64 |
| .endif |
| |
| .balign 16 |
| permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) |
| perm16: |
| .space 16 |
| @ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s |
| .balign 16 |
| fourway: @ Must be 0 mod 16 |
| shareA: @ 0 mod 16 |
| .space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 |
| shareB: @ 4 mod 16 |
| .space 20 |
| shareC: @ 8 mod 16 |
| .space 4 |
| statevperm: @ 12 mod 16 |
| .space 4 @ vperm state rotation: only last two bits are operational; other bits random |
| RKshareC: @ Round key common share C; see comment at init_key_4way for explanation |
| .space 4 |
| RKshareCchange: @ Temporary used by ref_roundkey_shares_s |
| .space 4 |
| IV0: @ 2-way share of IV for block 0 |
| .space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) |
| @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers |
| @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless |
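@ Illustratively, IV word i is recovered as IV0[i] ^ (IV0[i+5] ror #16) for i = 0..3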
| |
| @ Regardless of configuration, the code uses a single 256-entry LUT, |
| @ which is a simple S-box table. |
| @ The LUT is represented as two shares, lut_a and lut_b, |
| @ whose values must be EORed. Furthermore, the contents of each share are |
@ scrambled according to a 4-byte "map". The map comprises two bytes that
| @ are EORed into the addressing of the share, and two bytes that are |
| @ EORed into the data read back from the share. Performing a lookup |
| @ of a value x involves computing |
| @ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁ |
| @ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and |
| @ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share. |
| @ In practice the result of a lookup is itself represented in two |
| @ shares, namely |
| @ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and |
| @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ |
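@ A minimal C sketch of that lookup (illustrative only; the real code inlines it
@ in init_key_sbox and map_sbox_s, using lut_a_map = a0|a1<<8|c0<<16|c1<<24 and
@ lut_b_map = b0|b1<<8|d0<<16|d1<<24):
@   uint8_t shareA_out = lut_a[x ^ a0 ^ a1] ^ c0 ^ d0;
@   uint8_t shareB_out = lut_b[x ^ b0 ^ b1] ^ c1 ^ d1;
@   // shareA_out ^ shareB_out == SBOX[x]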
| .balign 16 |
| lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) |
| .byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 |
| .byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 |
| .byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 |
| .byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 |
| .byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 |
| .byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf |
| .byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 |
| .byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 |
| .byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 |
| .byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb |
| .byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 |
| .byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 |
| .byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a |
| .byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e |
| .byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf |
| .byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 |
| lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b |
| .space 4 |
| .space 4 @ align to 8 mod 16 |
| lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup) |
| .space 256 |
| lut_b_map: |
| .space 4 |
| .space 4 @ align to multiple of 8 |
| |
| .balign 16 |
| rstate_all_start: @ Mark start of RNG data to allow selective memory wipe |
| rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero |
| .space 16 |
| jstate: @ 32-bit jitter state |
| .space 4 |
| rstate_lfsr: @ 32-bit LFSR random state and constant used to step it |
| .space 4 |
| .word 0x1d872b41 @ constant that defines a maximal-length LFSR |
| rstate_count: |
| .space 4 |
| rstate_all_end: @ Mark end of RNG data to allow selective memory wipe |
| |
| .if CT_BPERM |
| .balign 16 |
| murmur3_constants: @ Five constants used in murmur3_32 hash |
| .word 0xcc9e2d51 |
| .word 0x1b873593 |
| .word 0xe6546b64 |
| .word 0x85ebca6b |
| .word 0xc2b2ae35 |
| .endif |
| |
| scratch_y_end: |
| |
| @ Initialisation code in main .text section |
| .section .text,"ax",%progbits |
| |
| @ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. |
| @ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some |
| @ random numbers. |
| @ Trashes r0-r6 |
| .balign 4 |
| init_rstate: |
| CHK_COUNT 24,6 |
| ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET |
| ldr r5,=SHA256_BASE |
| movs r1,#1 |
| str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] |
| ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 |
| movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit |
| str r1,[r5,#SHA256_CSR_OFFSET] |
| str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] |
| #if HARDENING |
| movs r3, #0 |
| #endif |
| movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving |
| @ time for previous SHA computation to complete |
| 2: |
| movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. |
str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples
| str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started |
| str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) |
| adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET |
| 1: |
ldr r1,[r4,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET] @ wait for 192 ROSC samples to fill EHR, should take constant time
| cmp r1,#0 |
| bne 1b |
| subs r6,#1 @ done? |
| beq 3f |
| movs r1,#8 |
| 1: |
| ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) |
| str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block |
| #if HARDENING |
| adds r3,#1 |
| #endif |
| subs r1,#1 |
| bne 1b |
| #if HARDENING |
| ldr r1, =TRNG_BASE+TRNG_EHR_DATA0_OFFSET+32 |
| rcp_iequal_nodelay r0, r1 |
| #endif |
| ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length |
| str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] |
| b.n 2b |
| |
| 3: |
| #if HARDENING |
| movs r2, #(TRNG_BLOCKS*2) * 8 |
| rcp_iequal_nodelay r2, r3 |
| #endif |
| #if HARDENING |
| @ good test that we are dealing with real hardware |
| ldr r2,[r5,#SHA256_CSR_OFFSET] |
| movw r1,#SHA256_CSR_RESET |
| rcp_iequal_nodelay r1, r2 |
| rcp_iequal_nodelay r2, r1 |
| #endif |
| CHK_COUNT 25,6 |
| str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 |
| str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] |
| adds r5,r5,#SHA256_SUM0_OFFSET |
@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc)
| ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output |
| ldr r6,=rstate_sha |
@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc), r6=rstate_sha
| stmia r6,{r0-r3} |
| CHK_COUNT 26,6 |
| movs r0,#0 |
| #if !HARDENING |
| strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" |
| #else |
| str r0,[r6] @ make sure rstate_sha[0] has word 0 set to 0, representing "out of data" (24-31) and 0 numbers generated (0-23) |
| #endif |
| |
| @ try to find a non-zero initialiser to create a non-degenerate LFSR random state |
| ldr r1,[r5,#16] @ SHA SUM4 |
| cbnz r1,1f @ is word 4 non-zero? then use it |
| ldr r1,[r5,#20] @ SHA SUM5 |
| cbnz r1,1f @ otherwise, is word 5 non-zero? use it |
| mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) |
| 1: |
| str r1,[r6,#rstate_lfsr-rstate_sha] |
| |
| @ try to find a non-zero initialiser to create a non-degenerate ROSC random state |
| ldr r1,[r5,#24] @ SHA SUM6 |
| cbnz r1,1f @ is word 6 non-zero? then use it |
| ldr r1,[r5,#28] @ SHA SUM7 |
| cbnz r1,1f @ otherwise, is word 7 non-zero? use it |
| mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) |
| 1: |
| ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE |
| str r1,[r2,#0] @ Initialise ROSC LFSR |
| CHK_COUNT 27,6 |
| #if HARDENING |
| ldr r3,=ROSC_RANDOM_OFFSET+ROSC_BASE |
| cbnz r1, 1f |
| rcp_panic |
| 1: |
| ldr r3, [r3] |
| rcp_iequal_nodelay r1, r3 |
| #endif |
| |
| .if GEN_RAND_SHA |
| .if SH_JITTER |
| movs r2,#0 |
| str r2,[r6,#jstate-rstate_sha] |
| .endif |
| .endif |
| |
| CHK_COUNT 28,6 |
| bx r14 |
| |
| .thumb_func |
| decrypt: |
| @ r0=4-way key, r1=IV_shareA, r2=IV_shareB, r3=message buffer, [sp]=number of blocks |
ldr r12,[sp] @ Fetch the 5th argument into r12 (which we are allowed to treat as scratch according to the AAPCS)
| push {r14} |
| GET_CANARY r14,CTAG3,6 |
| #if !CALLER_INIT_RCP_COUNT |
| SET_COUNT 23,6 |
| #endif |
| push {r4-r11,r14} |
| push {r0-r3,r12} @ Save the five arguments |
| bl reset_sha_trng |
| bl init_rstate |
| @ randomly re-share the LUT contents |
| ldr r4,=lut_a |
| mov r5,#64 @ 64 words = 256 bytes |
| 1: |
| bl gen_rand_sha_nonpres |
| ldr r6,[r4,#lut_b-lut_a] @ EOR a random word into both shares |
| eors r6,r6,r0 |
@ r0 must be EORed into both shares; if it were EORed into only one, the combined LUT would no longer be right
| str r6,[r4,#lut_b-lut_a] |
| ldr r6,[r4] |
| #if HARDENING |
| eors r7,r6,r0 |
| eors r8,r7,r6 |
| rcp_iequal_nodelay r8, r0 |
| stmia r4!,{r7} |
| #else |
| eors r6,r6,r0 |
| stmia r4!,{r6} |
| #endif |
| subs r5,r5,#1 |
| bne 1b |
| #if HARDENING |
| ldr r5,=lut_a + 256 |
| rcp_iequal_nodelay r4, r5 |
| #endif |
| CHK_COUNT 29,6 |
| #if HARDENING |
| @check again as this is quite important |
| rcp_iequal_nodelay r5, r4 |
| #endif |
| bl remap @ scramble the LUTs |
| pop {r0} @ pointer to 4way key data |
| bl init_key_4way |
| // todo alex this may trash r12; is that ok? |
| bl lock_key |
| CHK_COUNT 32,6 |
| pop {r0-r3} @ r0=IV_shareA, r1=IV_shareB, r2=message, r3=num blocks |
| bl ctr_crypt_s |
| bl randomisechaff |
| clear03 |
| pop {r4-r11,r14} |
| CHK_CANARY r14,CTAG3,6 |
| pop {r15} |
| |
| .thumb_func |
| reset_sha_trng: |
| GET_CANARY r0,CTAG19,0 |
| ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET |
| ldr r2,[r1] |
| ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS |
| orrs r2,r2,r3 |
| str r2,[r1] @ reset the SHA hardware and the TRNG hardware |
| CHK_COUNT 23,6 |
| bics r2,r2,r3 |
| str r2,[r1] @ release the reset |
| CHK_CANARY r0,CTAG19,0 |
| bx r14 |
| |
| @ Put AES core code in first scratch area |
| .section .scratch_x.aes,"ax",%progbits |
| |
| @ if GEN_RAND_SHA==0 then we don't call the counting version |
| #if HARDENING && GEN_RAND_SHA |
| check_rnd_count_func: |
| @ NOTE: we don't bother with a canary here as we don't write anything |
| ldr r0,=rstate_sha |
| ldr r0, [r0] |
| rsbs r0,r0,#0 @ Negate bottom 24 bits to get the number of calls to gen_rand_sha[_nonpres] since the last reset |
bfc r0,#24,#8 @ clear the top byte (the SUM-register offset field), leaving just the 24-bit call count
| bx r14 |
| |
| reset_rnd_count_func: |
| push {lr} |
| GET_CANARY lr,CTAG11,0 |
| ldr r0,=rstate_sha |
| ldrb r1, [r0, #3] |
| orrs r1, #1 |
| lsls r1, #24 |
| str r1, [r0] |
| CHK_CANARY lr,CTAG11,0 |
| pop {pc} |
| #endif |
| |
| .if GEN_RAND_SHA |
| @ we need SHA256_SUM0_OFFSET==8 (see note below) |
| .if SHA256_SUM0_OFFSET!=8 |
| .err |
| .endif |
| |
| @ Return single random word in r0 |
| @ Preserves r1-r13 |
| .balign 4 |
| gen_rand_sha: |
| push {r1-r3,lr} |
| GET_CANARY r1,CTAG1,2 |
| push {r1} |
| .if SH_JITTER |
| ldr r2,=rstate_sha |
| ldr r0,[r2,#jstate-rstate_sha] |
| lsls r3,r0,#30 |
| lsrs r3,#28 |
| movs r1,#1 |
| lsls r3,r1,r3 @ 1<<(4*(r0&3)) |
| udiv r3,r3,r1 @ Takes constant + (r0&3) cycles |
| lsrs r0,r0,#2 |
| bne 1f |
| bl gen_rand_sha_nonpres |
| ldr r2,=rstate_sha |
| #if HARDENING |
| ldr r1,[r2] @ Make this (SH_JITTER) not affect rnd_count |
| adds r1,r1,#1 @ (compensating for call to gen_rand_sha_nonpres which decrements the count by 1) |
| str r1,[r2] @ The purpose is to simplify check_rnd_count calls, and to avoid having to reset jstate frequently |
| #endif |
| 1: |
| str r0,[r2,#jstate-rstate_sha] |
| .endif |
| bl gen_rand_sha_nonpres |
| pop {r1} |
| CHK_CANARY r1,CTAG1,0 |
| pop {r1-r3,pc} |
| |
| @ Return single random word in r0 |
| @ Trashes r1-r3 |
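@ Illustrative pseudocode of the scheme below (shown for the !HARDENING layout,
@ where byte 0 of rstate_sha holds the offset of the next SHA-256 SUM register;
@ helper names are shorthand for this sketch only):
@   off = rstate_sha.byte[0];
@   if (off > 4) { rstate_sha.byte[0] = off - 4; return sum_reg_at(off); }  // SUM6 down to SUM0
@   // pool exhausted: treat rstate_sha as a 128-bit counter, add 256 to it
@   // (leaving byte 0 back at SUM6_OFFSET), hash those 16 bytes, return SUM7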
| .balign 4 |
| gen_rand_sha_nonpres: |
| push {lr} |
| GET_CANARY lr,CTAG18,0 |
| ldr r2,=rstate_sha |
| #if !HARDENING |
| ldr r3,=SHA256_BASE |
| ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) |
| subs r0,r1,#4 @ decrement it to previous SUM register |
| ble 1f @ if the offset was 4 or less we have run out of SUM register values |
| strb r0,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] |
| ldr r0,[r3,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 |
| #else |
| ldr r3,=SHA256_BASE |
| ldr r1,[r2] @ get word counter (8) : rand counter (24) from first word of rstate_sha[] (offset into SUM registers) |
| lsls r0, r1, #1 @ clear C (also set N which may force us down BLE path on skip of the sub below) |
| sbcs r0,r1,#0x04000000 @ decrement word counter for previous SUM register (and decrement rand counter due to C == 0) |
str r0,[r2] @ save updated word counter / rand counter in the first word of rstate_sha[]
| asrs r1, r0, #24 |
| ble 1f @ if the offset was 4 or less we have run out of SUM register values |
| |
| ldr r2,=SHA256_BASE + 4 |
| adds r2, r1 |
| adds r1, r3, r0, asr #24 |
| ldr r0, [r2], #-4 |
| rcp_iequal_nodelay r1, r2 |
| #endif |
| b gen_rand_sha_nonpres_exit |
| 1: |
| @ [CK_JITTER code was here] |
| movs r0,#SHA256_SUM6_OFFSET+1 |
| #if !HARDENING |
| strb r0,[r2] @ reset word counter: the +1 is compensated for later |
| #else |
| strb r0,[r2,#3] @ reset word counter: the +1 is compensated for later |
| #endif |
| movw r1,#(1<<SHA256_CSR_BSWAP_LSB)+(1<<SHA256_CSR_START_LSB) |
| str r1,[r3,#SHA256_CSR_OFFSET] @ start SHA256 hardware |
| movs r0,#3 @ take four words from rstate_sha, incrementing as we go |
| ldr r1,[r2] |
| #if !HARDENING |
| adds r1,r1,#255 @ overall this adds 256 to the value in rstate_sha and resets the bottom byte to SHA256_SUM6_OFFSET |
| #else |
| adds r1,r1,#0xff000000 @ overall this adds 256 to the value in rstate_sha and resets the bottom byte to SHA256_SUM6_OFFSET |
| #endif |
| 1: |
| str r1,[r2],#4 |
| str r1,[r3,#SHA256_WDATA_OFFSET] |
| cbz r0,3f |
| ldr r1,[r2] |
| adcs r1,r1,#0 |
| sub r0,r0,#1 @ preserve the carry |
| b 1b |
| 3: |
| movs r1,#0x80 @ End of message bit (with byte-swapped endianity) = start of message padding |
| str r1,[r3,#SHA256_WDATA_OFFSET] |
| movs r1,#9 |
| 1: |
| str r0,[r3,#SHA256_WDATA_OFFSET] |
| subs r1,r1,#1 |
| bpl 1b |
| |
| lsls r2, r1, #31 @ Specifies message length = 128 bits (with byte-swapped endianity) (i.e. 0x80000000) |
| str r2,[r3,#SHA256_WDATA_OFFSET] |
| 1: |
| ldr r0,[r3,#SHA256_CSR_OFFSET] |
| #if HARDENING |
| asrs r2, #1 |
| #endif |
| lsrs r0,r0,#SHA256_CSR_SUM_VLD_LSB+1 |
| bcc 1b @ wait for hardware to finish |
| #if HARDENING |
| @ r1 is -1 from loop above |
@ r2 has been shifted (asr) right from 0x80000000; empirically the loop takes more than 6 iterations, so we should have multiple 1s in the high bits
| @ note also that if 0x80000000 was not set above correctly, r2 might not be negative |
| asrs r2, #26 |
| @ BEWARE this will fail if you step thru the above loop in the debugger as it will finish too quickly! |
| rcp_iequal_nodelay r1, r2 |
| #endif |
| ldr r0,[r3,#SHA256_SUM7_OFFSET] |
| gen_rand_sha_nonpres_exit: |
| CHK_CANARY lr,CTAG18,0 |
| pop {pc} |
| .endif |
| |
| @ simple LFSR rand versions |
| @ return a random number in r0 |
| @ This version preserves all r1-r13 |
| @ 23 or 24 cycles including branch = 23 or 24 cycles/word |
| @ (would be 20 or 21 cycles if written out) |
| .balign 4 |
| .thumb_func |
| .if !GEN_RAND_SHA |
| gen_rand_sha: |
| gen_rand_lfsr: @ Not used |
| push {r14} |
| GET_CANARY r14,CTAG2,2 |
| push {r1-r3,r14} |
| bl gen_rand_lfsr_nonpres |
pop {r1-r3,r14}
| CHK_CANARY r14,CTAG2,0 |
| pop {r15} |
| .endif |
| |
| @ Trashes r1,r2,r3 |
| @ 12 cycles including branch = 12 cycles/word |
| .balign 4 |
| .if !GEN_RAND_SHA |
| gen_rand_sha_nonpres: |
| .endif |
| gen_rand_lfsr_nonpres: |
| GET_CANARY r3,CTAG10,0 |
| ldr r2,=rstate_lfsr |
| ldmia r2,{r0-r1} @ r0=state_in, r1=0x1d872b41=constant for a maximum-length sequence |
| and r1,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0 |
| eors r0,r1,r0,lsl#1 |
| #if HARDENING |
@ Basically r3 &= (r0 ? 0xffffffff : 0) but still potentially perturbing r3 unless the and is skipped
| clz r1, r0 |
| subs r1, #32 |
| asrs r1, #5 |
| ands r3, r1 |
| #endif |
| str r0,[r2] |
| CHK_CANARY r3,CTAG10,0 |
| bx r14 |
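@ Illustrative C equivalent of the Galois LFSR step above (and of the steplfsr
@ macro below); 0x1d872b41 is the tap constant stored just after rstate_lfsr:
@   uint32_t lfsr_step(uint32_t s) {
@       uint32_t taps = (s & 0x80000000u) ? 0x1d872b41u : 0u;  // keep taps iff a 1 is shifted out
@       return (s << 1) ^ taps;
@   }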
| |
| .macro loadlfsr |
| ldr r2,=rstate_lfsr |
| ldmia r2,{r0-r1} @ r0=lfsr_state, r1=lfsr_const=0x1d872b41 for a maximum-length sequence |
| .endm |
| |
| .macro steplfsr |
| ands r3,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0 |
| eors r0,r3,r0,lsl#1 |
| .endm |
| |
| .macro steplfsr_check |
| steplfsr |
| bne steplfsr_check\@ |
| rcp_panic |
| steplfsr_check\@: |
| .endm |
| |
| .macro savelfsr |
| str r0,[r2] |
| .endm |
| |
| .ltorg |
| |
| .balign 4 |
| .thumb_func |
| makesmallperm: |
@ Make a uniformly random permutation of R0 bytes and store the resulting byte array at R1
| @ Should be very uniform up to R0=10; maybe 11 or 12 are also OK. (10! << 2^32) |
| @ To make it valid up to R0=256, move the bl gen_rand_sha inside the loop |
| @ Uses inside-out method (slightly more efficient variant of Fisher-Yates) |
| @ Trashes r0-r3 |
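@ Reference sketch of the inside-out construction used below (illustrative only;
@ rand_below() is not a real function here):
@   for (i = 0; i < n; i++) { j = rand_below(i+1); p[i] = p[j]; p[j] = i; }
@ rand_below(i+1) is taken as the high word of the 64-bit product r0*(i+1) (umull),
@ and the low word of the product is kept as the remaining randomness in r0.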
| |
| push {r14} |
| GET_CANARY r14,CTAG4,6 |
| push {r4-r6,r14} |
| movs r4,r1 |
| movs r6,r0 |
| movs r1,#0 |
| movs r2,#1 |
| bl gen_rand_sha |
| |
| 1: |
| @ r1,r2=i,i+1, i=0, 2, 4, ... |
| cmp r1,r6 |
| beq 2f |
| |
| umull r0,r3,r0,r2 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r1] |
| strb r1,[r4,r3] |
| adds r1,r1,#2 |
| |
| @ r2,r1=i,i+1, i=1, 3, 5, ... |
| cmp r2,r6 |
| beq 2f |
| |
| umull r0,r3,r0,r1 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r2] |
| strb r2,[r4,r3] |
| adds r2,r2,#2 |
| |
| b 1b |
| |
| 2: |
| pop {r4-r6,r14} |
| CHK_CANARY r14,CTAG4,6 |
| pop {r15} |
| |
| .balign 4 |
| .thumb_func |
| makeperm16: |
| @ Make a random permutation of 16 things using the inside-out method (slightly more efficient variant of Fisher-Yates) |
| @ Store it in the 16 bytes at perm16 |
@ More efficient than calling makesmallperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha
| @ Trashes r0-r5 |
| |
| GET_CANARY r0,CTAG5,1 |
| push {r0,r14} |
| ldr r4,=perm16 |
| bl gen_rand_sha_nonpres |
| |
| @ i=0 |
| movs r1,#0 |
| movs r2,#1 @ r1,r2=i,i+1 |
| strb r1,[r4] |
| |
| @ i=1 |
| adds r1,r1,#2 @ r1,r2=i+1,i |
| umull r0,r3,r0,r1 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r2] |
| strb r2,[r4,r3] |
| |
| 1: |
| @ i=2, 4, 6, 8 |
| adds r2,r2,#2 @ r1,r2=i,i+1 |
| umull r0,r3,r0,r2 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r1] |
| strb r1,[r4,r3] |
| |
| @ i=3, 5, 7, 9 |
| adds r1,r1,#2 @ r1,r2=i+1,i |
| umull r0,r3,r0,r1 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r2] |
| cmp r1,#10 |
| strb r2,[r4,r3] |
| bne 1b |
| |
| @ refresh random number after extracting 10! from it |
| @ 10! and 16!/10! are both much less than 2^32, so the permutation will be extremely close to uniform |
| bl gen_rand_sha |
| |
| 1: |
| @ i=10, 12, 14 |
| adds r2,r2,#2 @ r1,r2=i,i+1 |
| umull r0,r3,r0,r2 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r1] |
| strb r1,[r4,r3] |
| |
| @ i=11, 13, 15 |
| adds r1,r1,#2 @ r1,r2=i+1,i |
| umull r0,r3,r0,r1 |
| ldrb r5,[r4,r3] |
| strb r5,[r4,r2] |
| cmp r1,#16 |
| strb r2,[r4,r3] |
| bne 1b |
| |
| pop {r0,r14} |
| CHK_CANARY r0,CTAG5,4 |
| bx r14 |
| |
| .balign 4 |
| .thumb_func |
| remap: |
| @ do a random remap of the LUTs |
| @ preserves r0-r11; trashes r12 |
| GET_CANARY r12,CTAG6,6 |
| push {r0-r12,r14} |
| bl gen_rand_sha_nonpres |
| ldr r1,=lut_a |
| bl remap_1 |
| bl gen_rand_sha_nonpres |
| ldr r1,=lut_b |
| bl remap_1 |
| pop {r0-r12,r14} |
| CHK_CANARY r12,CTAG6,6 |
| bx r14 |
| |
| remap_1: |
| @ r0: B0:xa B1:xb B2:ya B3:yb |
| @ r1: array of 256 bytes, followed by a 4-byte map |
| @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0 |
| GET_CANARY r6,CTAG7,6 |
| push {r6,r14} |
| mov r14,0x01010101 |
| ubfx r6,r0,#16,#8 |
| ubfx r7,r0,#24,#8 |
| mul r6,r6,r14 @ data remaps ya and yb, byte replicated |
| mul r7,r7,r14 |
| movw r10,#0x1010 |
| and r10,r10,r0,lsl#3 @ 0/16 in each byte of r10 from b1 and b9 of r0, ready for rotates by 0 or 16 |
| mov r3,#0x7f7f7f7f |
| ubfx r2,r0,#0,#1 |
| lsl r11,r3,r2 @ 0x7f or 0xfe in each byte of r11, ready for sel of rev16 |
| ubfx r2,r0,#8,#1 |
| lsl r12,r3,r2 |
| ldr r2,[r1,#0x100] @ old map |
| eors r2,r2,r0 |
| str r2,[r1,#0x100] @ updated map |
| // todo graham; what is the effect of not doing the whole loop - is it broken if you just do some? |
| mov r2,#252 @ loop over entries |
| 1: |
| ldr r4,[r1,r2] |
| eor r3,r2,r0 |
| eor r3,r3,r0,ror#8 |
| and r3,r3,#0xfc @ r3=remapped address r2 |
| ldr r5,[r1,r3] |
| eors r5,r5,r6 @ remap data; ensure case x==0 works by doing both remaps on same side |
| eors r5,r5,r7 |
| lsr r8,r10,#8 |
| ror r5,r5,r8 @ ROR#16 is the same as eor of address with 2 |
| ror r5,r5,r10 |
| rev16 r8,r5 @ REV16 is the same as eor of address with 1 |
| uadd8 r9,r11,r11 |
| sel r5,r8,r5 |
| rev16 r8,r5 |
| uadd8 r9,r12,r12 |
| sel r5,r8,r5 |
| mul r8,r14,r2 |
| mul r9,r14,r3 |
| usub8 r8,r8,r9 @ bytewise comparison of original address and remapped address, both byte replicated |
| sel r8,r4,r5 @ swap r4 and r5 as necessary in constant time |
| str r8,[r1,r2] @ write possibly swapped values back |
| sel r8,r5,r4 |
| str r8,[r1,r3] |
| subs r2,r2,#4 |
| bpl 1b |
| pop {r6,r14} |
| CHK_CANARY r6,CTAG7,6 |
| bx r14 |
| |
| .if RK_ROR |
| |
| @ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC |
| @ Trashes r0-r12 |
| @ If i = word number 0..3, |
| @ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then |
| @ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and |
| @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) |
| @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 |
| |
| .macro ref_roundkey_shares_s_impl |
| ldr r4,=rkey_s |
| loadlfsr |
| steplfsr_check @ r0=change in RKshareC |
| ldr r2,=RKshareCchange |
| str r0,[r2] |
| ldr r3,=RKshareC |
| ldr r5,[r3] |
| eors r5,r5,r0 |
| str r5,[r3] |
| @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter |
| |
| ref_roundkey_shares_s_loop: |
| ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA |
| |
| ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB |
| mov r2,r12,lsr#30 @ r2 = vpermB |
| sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk) |
| mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32 |
| mov r12,r12,ror r2 |
| usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2) |
| |
| @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff |
| steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr_check; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] |
| |
| ldr r3,=RKshareCchange |
| ldr r3,[r3] |
| movs r2,#0 |
| usub8 r10,r2,r10 |
| ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 |
| ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2 |
| ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2 |
| ror r2,r3,r10; eors r8,r8,r2 |
| |
| subs r4,r4,#20 |
| stmia r4,{r5-r8} |
| adds r4,r4,#40 |
| subs r11,r11,#1 |
| |
| bne ref_roundkey_shares_s_loop |
| #if HARDENING |
| ldr r5,=rkey_s + 40 * 15 |
| rcp_iequal_nodelay r4, r5 |
| #endif |
| ldr r2,=rstate_lfsr @ restore rstate_lfsr |
| savelfsr @ Save lfsr_state |
| clear03 24 |
| .endm |
| |
| .else // RK_ROR |
| |
| @ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC |
| @ Trashes r0-r11 |
| .macro ref_roundkey_shares_s_impl |
| ldr r4,=rkey_s |
| loadlfsr |
| steplfsr_check @ r0=change in RKshareC |
| ldr r3,=RKshareC |
| ldr r5,[r3] |
| eors r5,r5,r0 |
| str r5,[r3] |
| mov r10,r0 |
| ref_roundkey_shares_s_loop: |
| ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 |
| |
| @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later) |
| |
| ldr r3,[r4,#16] @ rkey shareB has a vperm of r10>>30 |
| movs r3,r3,lsr#30 |
| sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) |
| @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter |
| |
| steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 |
| steplfsr_check; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] |
| |
| subs r4,r4,#20 |
| stmia r4,{r5-r8} |
| adds r4,r4,#40 |
| subs r11,r11,#1 |
| |
| @ clear03: would need to do this with, say r3,r5-r8 |
| |
| bne ref_roundkey_shares_s_loop |
| savelfsr |
| clear03 24 |
| #if HARDENING |
| ldr r5,=rkey_s + 40 * 15 |
| rcp_iequal_nodelay r4, r5 |
| #endif |
| .endm |
| .endif |
| |
| .if INLINE_REF_ROUNDKEY_SHARES_S |
| .macro inline_ref_roundkey_shares_s |
| ref_roundkey_shares_s_starts: |
| mov r11,#15 @ there are 15 expanded keys |
| ref_roundkey_shares_s_impl |
| ref_roundkey_shares_s_end: |
| .endm |
| .else |
| .balign 4 |
| .thumb_func |
| ref_roundkey_shares_s: |
| mov r11,#15 @ there are 15 expanded keys |
| ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds |
| push {lr} |
| GET_CANARY lr,CTAG8,6 |
| ref_roundkey_shares_s_impl |
| CHK_CANARY lr,CTAG8,6 |
| pop {pc} |
| .endif |
| |
| .if RK_ROR |
| |
| @ Rotates roundkey vperms and RK_ROR rotations by random amounts |
| @ Trashes r0-r10 |
| @ If i = word number 0..3, |
| @ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then |
| @ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and |
| @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) |
| @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 |
| .macro ref_roundkey_hvperms_s_impl |
| ldr r10,=rkey_s |
| ref_roundkey_hvperms_s_loop: |
| bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations |
| ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations |
| str r0,[r10,#16] |
| mov r8,r0,lsr#30 @ r8=new vperm low |
| sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk |
| mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 |
| mov r0,r0,ror r8 |
| usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) |
| movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 |
| movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 |
| movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 |
| movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] |
| adds r10,r10,#20 |
| adds r7,r7,#1 |
| cmp r7, #30 |
| bne ref_roundkey_hvperms_s_loop |
| clear03 28 |
| .endm |
| |
| .else |
| |
| @ Rotates roundkey vperms by random amounts |
| @ Trashes r0-r9 |
| .macro ref_roundkey_hvperms_s_impl |
| bl gen_rand_lfsr_nonpres |
| ldr r1,=rkey_s |
| ref_roundkey_hvperms_s_loop: |
| cmp r7,#15 |
| bne 2f |
| @ Get a new random r0 after using 15 x 2 bits of the original one |
| @ Note that the junk bits (2-31) in the vperms are not adjusted independently, but that's no big loss, |
| @ and the gain is only calling gen_rand_lfsr twice instead of 30 times. |
| push {r1}; bl gen_rand_lfsr_nonpres; pop {r1} |
| 2: |
| ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits) |
| mov r8,r9,lsr#30 @ r8=old vperm (low) |
| add r6,r9,r0 @ r6=new vperm (high) | new junk |
| str r6,[r1,#16] |
| rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits |
| ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1 |
| ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1 |
| ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1 |
| ands r6,r6,#3; str r5,[r1,r6,lsl#2] |
| adds r1,r1,#20 |
| movs r0,r0,ror#2 |
| adds r7,r7,#1 |
| cmp r7, #30 |
| bne ref_roundkey_hvperms_s_loop |
| clear03 28 |
| .endm |
| .endif |
| |
| .if INLINE_REF_ROUNDKEY_HVPERMS_S |
| .macro inline_ref_roundkey_hvperms_s |
| ref_roundkey_hvperms_s_starts: |
| movs r7,#0 |
| ref_roundkey_hvperms_s_impl |
| ref_roundkey_hvperms_s_end: |
| .endm |
| .else |
| .balign 4 |
| .thumb_func |
| ref_roundkey_hvperms_s: |
| movs r7,#0 |
| ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 15 rounds |
| GET_CANARY r0,CTAG9,6 |
| push {r0, lr} |
| ref_roundkey_hvperms_s_impl |
| pop {r0} |
| CHK_CANARY r0,CTAG9,6 |
| pop {pc} |
| .endif |
| |
| .ltorg |
| |
| .if ST_VPERM |
| .balign 4 |
| .thumb_func |
@ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount
| @ given in the bottom two bits of R0 and update the rotation recorded at statevperm. |
| @ On entry R1 must point to statevperm. |
| @ Trashes r0-r3,r12 |
@ Maintains r4=rorig(4+(-[r1])%4), r5=rorig(4+(1-[r1])%4), ...
@ r8=rorig(8+(-[r1])%4), r9=rorig(8+(1-[r1])%4), ...
@ Note: only low 2 bits of [r1] are used. The rest are random to add to the noise.
| addstatevperm: |
| push {r14} |
| GET_CANARY r14,CTAG20,0 |
| ldr r2,[r1] |
| adds r2,r2,r0 |
| str r2,[r1] |
| |
| ldr r1,=shareA |
| ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ldmia r1,{r4-r7} |
| |
| getchaffaddress r12 @ Overwrite temporary storage with random numbers |
| ldmia r12!,{r2,r3} |
| stmia r1!,{r2,r3} |
| ldmia r12!,{r2,r3} |
| stmia r1!,{r2,r3} |
| |
| ldr r1,=shareB |
| ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 |
| ldmia r1,{r8-r11} |
| |
| getchaffaddress r0,16 @ Overwrite temporary storage with random numbers |
| ldmia r0!,{r2,r3} |
| stmia r1!,{r2,r3} |
| ldmia r0!,{r2,r3} |
| stmia r1!,{r2,r3} |
| |
addstatevperm_exit: @ label the exit point so that analysis code can refer to it
| CHK_CANARY r14,CTAG20,0 |
| pop {pc} |
| .endif |
| |
| @ Conjugate lut_a, lut_b with (state) shareC |
| @ I.e., EOR the input and output with shareC. |
@ We need to EOR shareC into one addressing-map byte of each of lut_a and lut_b, and into one data-map byte of just ONE of them
@ Arbitrarily choosing a0, b1 and d0
| .balign 4 |
| conjshareC: |
| push {r14} |
| GET_CANARY r14,CTAG21,0 |
| .if ST_SHAREC |
| ldr r1,=shareA |
| ldr r0,[r1, #shareC-shareA] @ Get shareC as a word (all bytes the same) |
| ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... |
| ldr r2,[r1,#0x100] |
| eors r2,r2,r0,lsr#24 |
| str r2,[r1,#0x100] |
| movs r0,r0,lsr#16 |
| ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0. |
| ldr r2,[r1,#0x100] |
| eors r2,r2,r0,lsl#8 |
| str r2,[r1,#0x100] |
| .endif |
| CHK_CANARY r14,CTAG21,0 |
| pop {pc} |
| |
| .macro shift_rows_s_impl |
| @ First "rotate" the two most-significant bytes of the state by two registers |
| @ Trashes r0-r3 |
| @ Slightly faster (but not shorter?) with ubfx/bfi |
| eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; |
| lsrs r0,r0,#16 |
| lsls r0,r0,#16 |
| eors r4,r4,r0 |
| eors r6,r6,r0 |
| eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; |
| lsrs r0,r0,#16 |
| lsls r0,r0,#16 |
| eors r5,r5,r0 |
| eors r7,r7,r0 |
| @ next "rotate" the two odd-significance bytes of the state by one register |
| eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00; |
| ands r1,r1,#0xff00ff00 |
| eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r4,r4,r0 |
| eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r5,r5,r0 |
| eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r6,r6,r0 |
| eors r7,r7,r1 @ state[3]^=tb; |
| @ repeat for other share, conjugated by ror#16 |
| clear01 @ barrier |
| eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta; |
| lsls r0,r0,#16 |
| lsrs r0,r0,#16 |
| eors r8,r8,r0 |
| eors r10,r10,r0 |
| eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta; |
| lsls r0,r0,#16 |
| lsrs r0,r0,#16 |
| eors r9,r9,r0 |
| eors r11,r11,r0 |
| eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; |
| ands r1,r1,#0xff00ff00 |
| eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r8,r8,r0 |
| eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r9,r9,r0 |
| eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; |
| ands r0,r0,#0xff00ff00 |
| eors r10,r10,r0 |
| |
| eors r11,r11,r1 @ state[3]^=tb; |
| |
| clear01 @ barrier |
| .endm |
| |
| .if INLINE_SHIFT_ROWS_S |
| .macro inline_shift_rows_s |
| shift_rows_s_starts: |
| shift_rows_s_impl |
| shift_rows_s_end: |
| .endm |
| .else |
| .balign 4 |
| .thumb_func |
@ Not going to use canaries here as it doesn't write anything - could be used to perturb register values, but not super worried about that yet
| shift_rows_s: |
| shift_rows_s_impl |
| bx r14 |
| .endif |
| |
| @ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 |
| @ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b |
| .macro mixcol rx,rt,ru,r0x00,r0x1b |
| @ let rx=(a,b,c,d) |
| uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags |
| sel \ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2 |
| eors \rt,\rt,\ru @ (2a,2b,2c,2d) |
| |
| eors \ru,\rt,\rx @ (3a,3b,3c,3d) |
| eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a) |
| eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b) |
| eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c) |
| .endm |
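@ Illustrative C equivalent of the macro above (names are for exposition only):
@   static uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
@   static uint32_t xtime4(uint32_t x) {      // bytewise GF(2^8) multiply by 2
@       return ((x & 0x7f7f7f7fu) << 1) ^ (((x >> 7) & 0x01010101u) * 0x1b);
@   }
@   static uint32_t mixcol32(uint32_t x) {    // x=(a,b,c,d) -> c(x).x mod x^4+1
@       uint32_t t = xtime4(x);               // (2a,2b,2c,2d)
@       uint32_t u = t ^ x;                   // (3a,3b,3c,3d)
@       return t ^ ror32(x, 24) ^ ror32(x, 16) ^ ror32(u, 8);
@   }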
| |
| @ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 |
| .macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b |
| uadd8 \rt,\rx,\rx @ field multiplication by 2 as above |
| sel \rw,\r0x1b,\r0x00 |
| eors \rt,\rt,\rw @ 2x |
| uadd8 \ru,\rt,\rt |
| sel \rw,\r0x1b,\r0x00 |
| eors \ru,\ru,\rw @ 4x |
| uadd8 \rv,\ru,\ru |
| sel \rw,\r0x1b,\r0x00 |
| eors \rv,\rv,\rw @ 8x |
| |
| eors \rx,\rx,\rv @ 9x |
| eors \rw,\rx,\rt @ 11x |
| eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16 |
| eors \rx,\rx,\ru @ 13x |
| eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24 |
| eors \rt,\rt,\ru @ 6x |
| eors \rt,\rt,\rv @ 14x |
| eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24 |
| .endm |
| |
| .balign 4 |
| .thumb_func |
@ Not going to use canaries here as it doesn't write anything - could be used to perturb register values, but not super worried about that yet
| @ Trashes r0-r3,r12 |
| mix_cols_s: |
| mov r2,#0x00000000 |
| mov r3,#0x1b1b1b1b |
| mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word |
| mixcol r5 ,r0,r1,r2,r3 |
| mixcol r6 ,r0,r1,r2,r3 |
| mixcol r7 ,r0,r1,r2,r3 |
| ldr r12,=chaff |
| ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers |
| mixcol r8 ,r0,r1,r2,r3 |
| mixcol r9 ,r0,r1,r2,r3 |
| mixcol r10,r0,r1,r2,r3 |
| mixcol r11,r0,r1,r2,r3 |
| ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers |
| bx r14 |
| |
| @ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) |
| .macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 |
| ubfx \Rspare0,\Rtarg,#0, #8 |
| ubfx \Rspare1,\Rtarg,#8, #8 |
| ubfx \Rspare2,\Rtarg,#16, #8 |
| ubfx \Rspare3,\Rtarg,#24, #8 |
| |
| ldrb \Rspare0,[\Rtable,\Rspare0] |
| ldrb \Rspare1,[\Rtable,\Rspare1] |
| ldrb \Rspare2,[\Rtable,\Rspare2] |
| ldrb \Rspare3,[\Rtable,\Rspare3] |
| orr \Rspare0,\Rspare0,\Rspare1,lsl#8 |
| orr \Rspare2,\Rspare2,\Rspare3,lsl#8 |
| orr \Rtarg,\Rspare0,\Rspare2,lsl#16 |
| .endm |
| |
| @ map all bytes of the state through the split LUT, lut_a and lut_b |
| @ Trashes r0-r3,r12 |
| .macro map_sbox_s_impl |
| ldr r0,=shareA @ Write out state share A to memory |
| @ stmia r0,{r4-r7} @ Used to do a STM |
| getchaffaddress r1 |
| ldr r2,[r1] |
str r4,[r0] @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms,
| str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired |
| str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. |
| str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. |
| str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but |
| str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. |
| str r7,[r0,#12] |
| str r2,[r1] |
| |
| ldr r0,=shareB @ Write out state share B to memory |
| stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with |
| |
| bl makeperm16 @ Rebuild random 16-way permutation. Maybe do this less frequently |
| @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation |
| |
| bl gen_rand_sha_nonpres |
| mov r11,r0 |
| ldr r8,=lut_a |
| ldr r9,=lut_b |
| ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) |
| eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk |
| uxtb r10,r3 |
| ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) |
| eors r1,r0,r1 |
| eors r2,r1,r1,lsr#8 |
| movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 |
| bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 |
| |
| ldr r4,=perm16 |
| ldr r5,=shareA |
| ldr r6,=shareB |
| movs r1,#0;movs r2,#0;movs r3,#0 |
| @ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 |
| movs r0,#15 |
| 1: @ (Ordering instructions to minimise result delays) |
| ldrb r1,[r4,r0] @ r1 = perm[r0] |
| mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits |
| eors r7,r1,#2 @ r7 = perm[r0]^2 |
| ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] |
| eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) |
| ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] |
| eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 |
| eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] |
| ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] |
| eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] |
| eor r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) |
| eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) |
| strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand |
| ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] |
| subs r0,r0,#1 |
| eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand |
| eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) |
| strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 |
| bpl 1b |
| clear03 8 @ barrier |
| |
| ldmia r6,{r8-r11} @ Read state share B back from memory |
| clear03 12 @ barrier |
| getchaffaddress r0,16 |
| bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 |
| @ldmia r5,{r4-r7} @ Read state share A back from memory |
| @clear03 16 @ barrier |
| ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s |
| ldr r1,[r0] |
| ldr r6,[r5,#8] |
| ldr r1,[r0,#8] |
| ldr r7,[r5,#12] |
| ldr r1,[r0,#12] |
| ldr r5,[r5,#4] @ Do r5 last because it's the address register |
| ldr r1,[r0,#4] |
| |
| @ Refresh state shares because luts only give imperfect share-by-value |
| @ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent) |
| @ loadlfsr |
| @ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc |
| @ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 |
| @ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 |
| @ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 |
| @ savelfsr |
| .endm |
| |
| .if INLINE_MAP_SBOX_S |
| .macro inline_map_sbox_s |
| map_sbox_s_starts: |
| // push {lr} |
| map_sbox_s_impl |
| // pop {lr} |
| map_sbox_s_end: |
| .endm |
| .else |
| .balign 4 |
| .thumb_func |
| map_sbox_s: |
| GET_CANARY r12,CTAG12,3 |
| push {r12,r14} |
| |
| map_sbox_s_impl |
| |
| pop {r12,r14} |
| CHK_CANARY r12,CTAG12,5 |
| bx r14 |
| .endif |
| |
| .ltorg |
| |
| .balign 4 |
| .thumb_func |
| randomisechaff: |
| @ Randomise 48 bytes of chaff values (random load values) |
| @ Uses 12 bytes of permscratch |
| @ Trashes r0-3 |
| GET_CANARY r0,CTAG13,6 |
| push {r0,r14} |
| movs r0,#12 |
| ldr r1,=permscratch |
| bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder |
| movs r1,#11 |
| 1: |
| push {r1} |
| bl gen_rand_sha_nonpres |
| pop {r1} |
| ldr r2,=permscratch |
| ldrb r2,[r2,r1] |
| getchaffaddress r3 |
| str r0,[r3,r2,lsl#2] |
| subs r1,r1,#1 |
| bpl 1b |
| pop {r0,r14} |
| CHK_CANARY r0,CTAG13,6 |
| bx r14 |
| |
| .balign 4 |
| refreshchaff_and_lfsr: |
| @ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff |
| @ Re-randomise LFSR with SHA |
| @ Uses 12 bytes of permscratch |
| @ Trashes r0-3,12 |
| GET_CANARY r0,CTAG14,6 |
| push {r0,r14} |
| |
| @ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence |
| bl gen_rand_sha_nonpres |
| ldr r1,=rstate_lfsr |
| ldr r2,[r1] |
| 1: |
| adds r2,r2,r0 |
@ note that r2 (the LFSR state) is non-zero on entry, so r2+r0 and r2+2*r0 cannot
@ both be zero; if the first sum is zero we simply loop once more and add r0 again,
@ so the loop terminates after at most one extra iteration
| beq 1b @ Don't update LFSR state to 0 |
| #if HARDENING |
| beq 1b |
| #endif |
| str r2,[r1] |
| |
| @ Choose a random order to update chaff words to make 2nd order attacks harder |
| movs r0,#12 |
| ldr r1,=permscratch |
| bl makesmallperm |
| |
| movs r1,#11 |
| 1: |
| push {r1} |
| bl gen_rand_lfsr_nonpres |
| pop {r1} |
| ldr r2,=permscratch |
| ldr r3,=chaff |
| ldrb r2,[r2,r1] |
| ldr r12,[r3,r2,lsl#2] |
| add r0,r0,r12 |
| str r0,[r3,r2,lsl#2] |
| subs r1,r1,#1 |
| bpl 1b |
| pop {r0,r14} |
| CHK_CANARY r0,CTAG14,6 |
| bx r14 |
| |
| .balign 4 |
| .thumb_func |
| @ Do sbox on the four bytes of the 4-way share r4-r7 |
| @ Trashes r0,r8-r12 |
| init_key_sbox: |
| GET_CANARY r12,CTAG15,6 |
| push {r1-r3,r12,r14} |
| bl gen_rand_sha_nonpres; mov r8,r0 |
| bl gen_rand_sha_nonpres; mov r9,r0 |
| bl gen_rand_sha_nonpres; mov r10,r0 |
| bl gen_rand_sha_nonpres; mov r11,r0 |
| ldr r0,=fourway @ Write out 4-way share to memory |
| stmia r0,{r8-r11} @ Save random values first to obscure saving of state |
| stmia r0,{r4-r7} |
| movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm |
| movs r5,#0 |
| movs r6,#0 |
| movs r7,#0 |
| |
| bl randomisechaff @ Randomise block of memory mainly used for obscuring loads |
| |
| movs r0,#4 |
| ldr r1,=permscratch |
| bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed |
| ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) |
| ldr r4,[r1] |
| ldr r0,=fourway |
| uxtab r5,r0,r4 |
| uxtab r6,r0,r4,ror#8 |
| uxtab r7,r0,r4,ror#16 |
| uxtab r8,r0,r4,ror#24 |
| stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] |
| |
| bl gen_rand_sha @ Save some randomness for the resharing operation later |
| movs r7,r0 |
| bl gen_rand_sha |
| movs r8,r0 |
| |
| ldr r2,=lut_a |
| ldr r3,=lut_b |
| ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) |
| eors r10,r0,r0,lsr#8 |
| uxtb r10,r10 @ R10 = a0^a1 |
| ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) |
| eors r1,r0,r1 |
| eors r4,r1,r1,lsr#8 |
| uxtb r11,r4 @ R11 = a0^a1^b0^b1 |
| eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 |
| movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 |
| |
| ldr r1,=permscratch |
| ldr r11,=chaff |
| @ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk |
| 1: |
| ands r5,r1,#12 |
| adds r5,r11,r5 @ Align chaff address to r1 |
| ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) |
| ldr r5,[r5] @ Random load to mask previous load |
| |
| ands r9,r6,#12 |
| add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 |
| ldrb r4,[r6,#0] |
| ldr r14,[r9,#0] @ Random load to mask previous load |
| eor r4,r4,r10 |
| eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 |
| |
| ldrb r5,[r6,#4] |
| ldr r14,[r9,#4] @ Random load to mask previous load |
| eors r4,r4,r5 |
| eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 |
| |
| ldrb r5,[r6,#8] |
| ldr r14,[r9,#8] @ Random load to mask previous load |
| eors r4,r4,r5 |
| eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 |
| |
| ldrb r5,[r6,#12] |
| ldr r14,[r9,#12] @ Random load to mask previous load |
| eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk |
| eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 |
| |
| ands r14,r4,#255 |
| ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] |
| and r14,r4,#15 |
| add r14,r14,#32 |
| ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) |
| eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 |
| @ split r5 into two shares and store at [r6,#0] and [r6,#4] |
| strb r7,[r6,#0] |
| eors r5,r5,r7 |
| strb r5,[r6,#4] |
| |
| mov r5,r10,lsr#8 @ r5=a0^a1^b0^b1 |
| ldr r14,[r11,#44] @ Need to eor into a random destination register |
| eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8 |
| and r14,r14,#255 |
| |
| ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1] |
| and r14,r14,#15 |
| add r4,r11,#24 |
| ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) |
| eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 |
| @ split r5 into two shares and store at [r6,#8] and [r6,#12] |
| strb r8,[r6,#8] |
| eors r5,r5,r8 |
| strb r5,[r6,#12] |
| |
| movs r7,r7,ror#8 |
| movs r8,r8,ror#8 |
| |
| tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16 |
| bne 1b |
| |
| ldr r0,=fourway |
| ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 |
| ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers |
| |
| pop {r1-r3,r12,r14} |
| CHK_CANARY r12,CTAG15,6 |
| bx r14 |
| |
| .balign 4 |
| .thumb_func |
| @ r1 = pointer to 4 x 4-way share (16 words); left unchanged |
| @ r3 = rkey_s+40*roundkeynumber; advanced by 40 |
| @ Trashes r8-r12 |
| @ If i = word number 0..3, |
| @ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then |
| @ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and |
| @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) |
| @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16 |
| storeroundkey: |
| GET_CANARY r8,CTAG16,6 |
| push {r2,r8,r14} |
| |
| @ eor two 4-way share components to make a component of a 2-way share |
| @ Note that we load from 4-way share at a random address then convert to 2-way share and |
| @ store at a fixed address, rather than the other way around, so that 2-way shares are obscured |
| @ by vperm (we don't know which 2-way share is being processed at a particular point in time). |
| @ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share |
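@ Illustratively (ignoring the RK_ROR byte rotations), for each round-key word:
@   shareA word = a ^ b
@   shareB word = (c ^ d ^ RKshareC) ror #16
@ so that un-rotating shareB and EORing RKshareC back in recovers rk = a^b^c^d.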
| |
| bl gen_rand_sha @ Get r0 = vperm for shareA of the round key |
| str r0,[r3,#16] |
| mov r8,r0,lsr#30 |
| rsb r8,r8,#0 @ r8=-vperm |
| .if RK_ROR |
| movs r2,#0 |
| usub8 r2,r2,r0 @ r2=-hperms |
| .endif |
| mov r9,#4 |
| 1: |
| and r8,r8,#3 |
| adds r0,r1,r8,lsl#4 |
| |
| ldmia r0,{r10,r11} |
| .if RK_ROR |
| mov r10,r10,ror r2 |
| mov r11,r11,ror r2 |
| movs r2,r2,ror#8 |
| .endif |
| eor r10,r10,r11 |
| str r10,[r3],#4 |
| add r8,r8,#1 |
| subs r9,r9,#1 |
| bne 1b |
| |
| adds r1,r1,#8 |
| adds r3,r3,#4 @ skip over vperm (already stored) |
| |
| bl gen_rand_sha @ Get r0 = vperm for shareB of the round key |
| str r0,[r3,#16] |
| mov r8,r0,lsr#30 |
| rsb r8,r8,#0 @ r8=-vperm |
| .if RK_ROR |
| movs r2,#0 |
| usub8 r2,r2,r0 @ r2=-hperms |
| .endif |
| mov r9,#4 |
| ldr r12,=RKshareC |
| ldr r12,[r12] |
| 1: |
| and r8,r8,#3 |
| adds r0,r1,r8,lsl#4 |
| ldmia r0,{r10,r11} |
| eor r10,r10,r12 @ Mix in RKshareC into round key shareB |
| .if RK_ROR |
| mov r10,r10,ror r2 |
| mov r11,r11,ror r2 |
| movs r2,r2,ror#8 |
| .endif |
| mov r10,r10,ror#16 |
| mov r11,r11,ror#16 |
| eor r10,r10,r11 |
| str r10,[r3],#4 |
| add r8,r8,#1 |
| subs r9,r9,#1 |
| bne 1b |
| |
| subs r1,r1,#8 @ Restore r1 = (r1 on entry) |
| adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 |
| |
| pop {r2,r8,r14} |
| CHK_CANARY r8,CTAG16,6 |
| bx r14 |
| |
| .balign 4 |
| .thumb_func |
| init_key_4way: |
| @ On entry, r0 points to 4-way shared raw key data (64 bytes, 64 byte gap for FIB workaround, then other 64 bytes) |
| @ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 |
| @ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. |
| @ |
@ On exit, rkey_s, a 40*15=600-byte region, is filled as follows.
| @ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4], |
| @ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information. |
| @ In addition a common share word, RKshareC, is set randomly. |
| @ For a given round, rk[i] = the i^th word of the actual round key is given by: |
| @ vpermA=rka[4]>>30 |
| @ vpermB=rkb[4]>>30 |
| @ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4]) |
| @ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 |
| @ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC |
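@
@ Expository sketch (comment only): assuming RK_ROR, a plain-C model of the above is
@ (ROR = rotate right by the count mod 32; ja, jb, rotA, rotB are expository names):
@   ja   = (i + (rka[4] >> 30)) & 3;
@   jb   = (i + (rkb[4] >> 30)) & 3;
@   rotA =  (rka[4] >> (8*ja)) & 0xff;
@   rotB = ((rkb[4] >> (8*jb)) & 0xff) + 16;
@   rk[i] = ROR(rka[ja], rotA) ^ ROR(rkb[jb], rotB) ^ RKshareC;
@ The code never actually unshares a round key like this in one place; the shares are only
@ ever combined word-by-word into the (shared) state, in addrkey_s.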
| |
| GET_CANARY r12,CTAG17,6 |
| push {r0-r12,r14} |
| |
| @ Transfer 4-way key into local workspace, rerandomising the shares |
| mov r5,r0 @ r5=4-way key input |
| bl randomisechaff |
| ldr r6,=rkey4way |
| movs r7,#8 |
| 1: |
| #if FIB_WORKAROUND |
| cmp r7,#4 |
| bne 2f |
| adds r5,#64 @ Skip 64 byte gap for FIB workaround |
| 2: |
| #endif |
| ldmia r5!,{r1-r4} |
bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 @ each random word is eor'd into two shares,
bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 @ so the 4-way xor (the key word itself)
bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 @ is unchanged while every share is refreshed
| stmia r6!,{r1-r4} |
| subs r7,r7,#1 |
| bne 1b |
| |
| @ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for |
| @ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. |
| bl gen_rand_sha_nonpres |
| ldr r12,=RKshareC |
| str r0,[r12] @ Make RKshareC random word |
| ldr r3,=rkey_s @ r3=rkey_s |
| ldr r1,=rkey4way @ r1=rkey4way |
| bl storeroundkey @ Store round key 0 and advance r3 by 40 |
| adds r1,r1,#64 |
| bl storeroundkey @ Store round key 1 and advance r3 by 40 |
| adds r1,r1,#48 |
| ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word |
| @ r1=rkey4way+128 on entry to main loop |
| movs r2,#0 @ r2=word counter (0-51), offset from word 8 |
| |
| @ Note that r1-r3 are not sensitive values, so it's safe to stack |
| @ them and conditionally branch on them. |
| |
| @ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of |
| @ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14 |
| @ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... -> a48 b48 c48 d48 -> a56 b56 c56 d56 |
| @ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57 |
| @ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58 |
| @ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59 |
| @ a4 b4 c4 d4 a52 b52 c52 d52 =============== |
| @ a5 b5 c5 d5 a53 b53 c53 d53 |
| @ a6 b6 c6 d6 a54 b54 c54 d54 |
| @ a7 b7 c7 d7 a55 b55 c55 d55 |
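@
@ For reference (comment only), the loop below is the standard AES-256 key expansion applied
@ share-wise. In unshared form, for word indices i = 8..59:
@   t = w[i-1];
@   if (i % 8 == 0) t = SubWord(RotWord(t)) ^ rcon[i/8];
@   else if (i % 8 == 4) t = SubWord(t);
@   w[i] = w[i-8] ^ t;
@ Here r2 = i-8, t is carried in r4-r7 as a 4-way share, RotWord is the ror#8s, SubWord is
@ init_key_sbox, and rcon[i/8] = 1 << (i/8 - 1) is eor'd into a single share.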
| |
| init_key_expandloop: |
| @ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) |
| @ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) |
| @ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) |
| @ r4-r7 = 4-way share of previous roundkey word |
| |
| tst r2,#7 |
| bne 1f |
| subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD |
| movs r4,r4,ror#8 |
| movs r5,r5,ror#8 |
| movs r6,r6,ror#8 |
| movs r7,r7,ror#8 |
| 1: |
| |
| tst r2,#3 |
| bne 1f |
| bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7 |
| 1: |
| |
| tst r2,#7 |
| bne 1f |
| movs r0,r2,lsr#3 |
| mov r8,#1 |
| movs r8,r8,lsl r0 |
| eors r4,r4,r8 @ Every 8th word, add in round constant |
| 1: |
| |
ldmia r1,{r8-r11} @ eor with the key word from 8 positions (two roundkeys) ago; the stmia below advances r1 by 16
| eors r4,r4,r8 |
| eors r5,r5,r9 |
| eors r6,r6,r10 |
| eors r7,r7,r11 |
| stmia r1!,{r4-r7} |
| |
| add r2,r2,#1 |
| tst r2,#3 |
| bne 1f |
| subs r1,r1,#64 |
| bl storeroundkey @ Store round key 1+r2/4 and advance r3 by 40 |
| adds r1,r1,#64 |
| 1: |
| |
| cmp r2,#52 |
| bne init_key_expandloop |
| |
| CHK_COUNT 30,6 |
| pop {r0-r12,r14} |
| CHK_CANARY r12,CTAG17,6 |
| bx r14 |
| |
| .ltorg |
| |
| @ Add the round key shares pointed to by r12 into the state shares |
| @ Trashes r0-r3 |
| .balign 4 |
| addrkey_s: |
| |
| ldr r0,=chaff @ guaranteed 0 mod 16 |
| .if ST_VPERM |
| ldr r3,=statevperm |
| ldr r3,[r3] @ r3=vperm state rotation in bottom two bits |
| ldr r2,[r0,#12] @ barrier load |
| .else |
| movs r3,#0 |
| .endif |
| bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 |
| ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits |
| ldr r2,[r0,#16] @ barrier load |
| |
| rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot |
| @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot |
| @ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr |
| .if RK_ROR |
| movs r0,r2,lsl#3 |
| movs r1,r1,ror r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 |
| .else |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 |
| .endif |
| clear03_preserve_r3 |
| add r12,r12,#20 |
| @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr |
| |
| bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 |
| ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits |
| ldr r2,[r0,#16] @ barrier load |
| rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot |
| ldr r3,=RKshareC @ r3=common round key shareC |
| bfi r0,r3,#0,#4 |
| ldr r3,[r3] |
| ldr r0,[r0] @ barrier load |
| |
| @ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot |
| @ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr |
| .if RK_ROR |
| movs r0,r2,lsl#3 |
| movs r1,r1,ror r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r10,r10,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0 |
| .else |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0 |
| ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 |
| .endif |
| clear03 |
| bx r14 |
| |
| .balign 4 |
| .thumb_func |
@ de/encrypt data in place
@ r0: IV shareA (pointer to 4 words)
@ r1: IV shareB (pointer to 4 words)
@ r2: buf
@ r3: n, number of blocks, n>0
| .if CT_BPERM |
| @ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV, |
| @ the key, and the block number. We can therefore process them in any order, and using a |
| @ random order helps to defeat attacks that work on the output of the AES, since an attacker |
| @ wouldn't know what plaintext or ciphertext corresponds to a particular instruction. |
| .endif |
| |
| ctr_crypt_s: |
| @ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks |
| GET_CANARY r12,CTAG0,6 |
| push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets |
| |
| push {r0-r3} |
| |
| #if !CALLER_INIT_RCP_COUNT |
| SET_COUNT 33,6 |
| #endif |
| |
| .if CT_BPERM |
| @ Initialise 32 random numbers (which fit in half-words) |
| @ r3=number of blocks |
| ldr r4,=bperm_rand |
| movs r5,#32 |
| 1: |
| bl gen_rand_sha |
| umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks) |
| strh r2,[r4],#2 |
| subs r5,r5,#1 |
| bne 1b |
| .endif |
| |
| bl randomisechaff |
| |
| @ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0 |
| @ Not doing shareC or state vperm at this point |
| pop {r0} |
| ldmia r0,{r4-r7} @ r4-r7 = IVshareA |
| clear03 16 |
| pop {r1} |
| ldmia r1,{r8-r11} @ r8-r11 = IVshareB |
| clear03 32 |
| bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc |
| bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 |
| bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 |
| bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 |
| ldr r0,=IV0 |
| stmia r0!,{r4-r7} |
| adds r1,r0,#4 |
| stmia r1,{r8-r11} |
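@ Expository note: IV0 (and later the AES state) is held as two 4-word shares with shareB
@ stored pre-rotated by 16 bits, so each true word w satisfies
@   w == shareA_word ^ ror(shareB_word, 16)     (^ shareC once ST_SHAREC is mixed in)
@ which is why shareB words are always combined with ror #16, e.g. in decryption_start below.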
| @ "Decommission" IV0 so that it doesn't get stacked |
| #if 1 // approved by Alex - no side channel leakage it seems |
| #if HARDENING |
// if this is skipped, r4 is likely random, so there is more than a 1 in 4 chance that the ldmia will trap
// in any case it is very unlikely to load useful data below (and presumably the faulting address is
// uninteresting since it is already XORed with random data above)
| movs r0, #32 |
| // note if r1 is unset, then we are reading from lut_a |
| movs r1, #0 |
| ldmia r1!, {r4, r5, r6, r7, r8, r9, r10, r11} |
| rcp_iequal_nodelay r0, r1 |
| #else |
| movs r0, #0 |
| ldmia r0, {r4, r5, r6, r7, r8, r9, r10, r11} |
| #endif |
| #else |
| bl gen_rand_sha_nonpres; movs r4,r0 |
| bl gen_rand_sha_nonpres; movs r5,r0 |
| bl gen_rand_sha_nonpres; movs r6,r0 |
| bl gen_rand_sha_nonpres; movs r7,r0 |
| bl gen_rand_sha_nonpres; mov r8,r0 |
| bl gen_rand_sha_nonpres; mov r9,r0 |
| bl gen_rand_sha_nonpres; mov r10,r0 |
| bl gen_rand_sha_nonpres; mov r11,r0 |
| #endif |
| @ Trashes r0, r1 |
| check_rnd_count (RND_COUNT_decrypt+RND_COUNT_ctr_crypt_s_init) |
| pop {r1,r2} |
| @ r1=cipher/plaintext buffer, r2=number of blocks |
| |
| movs r3,#0 |
| CHK_COUNT 33,6 |
| |
| ctr_crypt_mainloop: |
| SET_COUNT 80,6 |
| @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter |
| |
| @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) |
| push {r1-r3} |
| @ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) |
| |
| @ Trashes r0, r1 |
| reset_rnd_count_checked |
| |
// no point in having a branch if we would never take it (it would only cost size and weaken hardening)
| #if REFCHAFF_PERIOD != 1 |
| tst r3,#(REFCHAFF_PERIOD-1) |
| bne 1f |
| #endif |
| bl refreshchaff_and_lfsr |
| 1: |
| ldr r3,[sp,#8] @ get block count off the stack |
// no point in having a branch if we would never take it (it would only cost size and weaken hardening)
| #if REMAP_PERIOD != 1 |
| tst r3,#(REMAP_PERIOD-1) |
| bne 1f |
| #endif |
| bl remap @ shuffle the LUTs; this preserves R3 |
| 1: |
| |
| CHK_COUNT 80,6 |
| ldr r0,[sp,#8] @ get block count off the stack |
| #if HARDENING |
@ We check the random counts here. Note we start with the combined count and subtract, just because
@ it might make it marginally harder for a glitch that skips multiple instructions to leave the right answer
| movs r1, #(RND_COUNT_remap + RND_COUNT_refreshchaff_and_lfsr) |
| #if REMAP_PERIOD != 1 |
| tst r0, #(REMAP_PERIOD-1) |
| it ne |
| subne r1, #RND_COUNT_remap |
| #endif |
| #if REFCHAFF_PERIOD != 1 |
| tst r0, #(REFCHAFF_PERIOD-1) |
| it ne |
| subne r1, #RND_COUNT_refreshchaff_and_lfsr |
| #endif |
| @ r0=block count, r1=expected sha rand count, r3=block count |
| rcp_iequal_nodelay r0, r3 |
| @ r1=expected sha rand count, r3=block count |
| check_rnd_count_dynamic |
| #endif // HARDENING |
| @ r3=block count |
| |
@ No point in having a branch if we would never take it (it would only cost size and weaken hardening)
| #if REFROUNDKEYSHARES_PERIOD != 1 |
| #if HARDENING |
| // we want to check that we are calling enough |
| #warning REFROUNDKEYSHARES_PERIOD check needs hardening |
| #endif |
| tst r3,#(REFROUNDKEYSHARES_PERIOD-1) |
| bne skip_ref_roundkey_shares_s |
| #endif |
| #if INLINE_REF_ROUNDKEY_SHARES_S |
| inline_ref_roundkey_shares_s |
| #else |
| #if HARDENING |
| // todo graham we could remove this for space, as I don't think r4 and r5 are equal |
| @ Make sure r4 != r5 on entry to ref_roundkey_shares_s |
| subs r4, r5, #1 |
| #endif |
| bl ref_roundkey_shares_s @ refresh the round key shares |
| #if HARDENING |
@ r4 and r5 are set equal by ref_roundkey_shares_s (note we don't do a rnd_check as no sha random numbers are generated)
| rcp_iequal_nodelay r4, r5 |
| #endif |
| #endif |
| skip_ref_roundkey_shares_s: |
| |
| #if REFROUNDKEYHVPERMS_PERIOD != 1 |
| #if HARDENING |
| // we want to check that we are calling enough |
| #warning REFROUNDKEYHVPERMS_PERIOD check needs hardening |
| #endif |
| ldr r3,[sp,#8] @ get block count off the stack |
| tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) |
| bne skip_ref_roundkey_hvperm_s |
| #endif |
| #if INLINE_REF_ROUNDKEY_HVPERMS_S |
| inline_ref_roundkey_hvperms_s |
| #else |
| bl ref_roundkey_hvperms_s @ refresh the round key vperms |
| #if HARDENING |
| movs r0, #30 |
| @ r7 should be 30 on exit from ref_roundkey_hvperms_s |
| rcp_iequal_nodelay r0, r7 |
| #endif |
| #endif |
| skip_ref_roundkey_hvperms_s: |
| |
| CHK_COUNT 81,6 |
| |
| @ Trashes r0, r1 |
| reset_rnd_count |
| pop {r1-r3} |
| @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter |
| |
| @ Now calculate r12 = block number-to-be-deciphered from r3 = block counter |
| .if CT_BPERM |
| @ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 |
| push {r1} |
| ldr r0,=murmur3_constants |
| ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants |
| ldr r0,=bperm_rand |
| movs r1,#31 |
| movs r4,r3 @ r4=i |
| 1: |
| ldrh r5,[r0],#2 @ r5=k |
| subs r5,r5,r4 @ r5=k-i |
| ands r6,r2,r5,asr#31 @ r6=n*(k-i<0) |
| adds r5,r5,r6 @ r5=j=(k-i)%n |
| adds r6,r4,r5 @ r6=i+j |
| subs r7,r4,r5 @ r7=i-j |
| and r8,r7,r7,asr#31 @ r8=min(i-j,0) |
| sub r7,r7,r8,lsl#1 @ r7=|i-j| |
| mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j} |
| eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions |
| @ Now do murmur3_32 hash of r6 |
| mul r6,r6,r9 |
| movs r6,r6,ror#17 |
| mul r6,r6,r10 |
| movs r6,r6,ror#19 |
| adds r6,r6,r6,lsl#2 |
| add r6,r6,r11 |
| eors r6,r6,#4 |
| eors r6,r6,r6,lsr#16 |
| mul r6,r6,r12 |
| eors r6,r6,r6,lsr#13 |
| mul r6,r6,r14 |
eors r6,r6,r6,lsr#16 @ final murmur3 mix; it doesn't change the top bit, which is all that's used below
| @ Now set i to j, conditional on the top bit of r6 |
| subs r7,r5,r4 @ r7=j-i |
| ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) |
| adds r4,r4,r7 @ r4=j if top bit of r6, else i |
| subs r1,r1,#1 |
| bpl 1b |
// todo loop check
| pop {r1} |
| mov r12,r4 |
| .else |
| mov r12,r3 |
| .endif |
| CHK_COUNT 82,6 |
| |
| @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) |
| push {r1-r3,r12} |
| @ r4-r11 = IV0, r12=block number |
| |
processIV: @ not a branch target; label just marks this region to assist power analysis
| ldr r8,=IV0 |
| ldmia r8,{r4-r7} @ load IV0_A |
| clear03 16 |
| add r8,r8,#20 |
| ldmia r8,{r8-r11} @ load IV0_B |
| clear03 32 |
| rev r0,r12 |
| eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n. |
| @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n) |
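@ Expository model (comment only) of this CTR variant, ignoring all sharing: the cipher input
@ for block n is IV0 with rev(n) XORed into its last word, so
@   keystream[n] = AES256_Encrypt(key, IV0 ^ n')          @ n' = rev(n) in the last word
@   plaintext[n] = ciphertext[n] ^ keystream[n]
@ Distinct block numbers give distinct cipher inputs, so keystream blocks never repeat within a buffer.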
| @ r4-r11 = IV for the current block |
| CHK_COUNT 83,6 |
| .if ST_SHAREC |
| bl gen_rand_sha_nonpres @ Create state share C; all bytes the same |
| ands r0,r0,#255 |
| orrs r0,r0,r0,lsl#8 |
| orrs r12,r0,r0,lsl#16 |
| ldr r1,=shareC |
| str r12,[r1] |
| .else |
| movs r12,#0 |
| .endif |
| @ r4-r11 = IV for the current block w/o shareC, r12=shareC |
| @ refresh state shares and mix in shareC |
| bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc |
| bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 |
| bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 |
| bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 |
| .if ST_VPERM |
| bl gen_rand_sha_nonpres |
| ldr r1,=statevperm |
| movs r2,#0 |
| str r2,[r1] |
| bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) |
| #if HARDENING |
| // r1 is set to lut_b by addstatevperm |
| ldr r0, =shareB + 0x10 |
| rcp_iequal_nodelay r0, r1 |
| #endif |
| .endif |
| |
| @ Trashes r0, r1 |
| check_rnd_count RND_COUNT_ctr_crypt_mainloop_A |
| CHK_COUNT 84,6 |
| .if ST_SHAREC @ Avoid func call if the func is empty |
| bl conjshareC @ Add the effect of shareC to lut_a, lut_b |
| #if HARDENING |
// r1 is set to lut_b by conjshareC
| ldr r2,=lut_b |
| rcp_iequal_nodelay r1, r2 |
| #endif |
| .endif |
| // todo graham remove this count |
| CHK_COUNT 85,6 |
| @ now perform the 15 encryption rounds on (key, state=IV+x) |
| @ here r4-r7, r8-r11: state |
| mov r2,#0 @ round counter |
| rounds_s_mainloop: |
| @ Trashes r0, r1 |
| reset_rnd_count_checked |
| ldr r12,=rkey_s |
| add r12,r12,r2,lsl#5 @ pointer to key shares for this round |
| add r12,r12,r2,lsl#3 |
| push {r2} @ save round count |
| bl addrkey_s |
| .if INLINE_MAP_SBOX_S |
| inline_map_sbox_s |
| .else |
| bl map_sbox_s |
| .endif |
| .if INLINE_SHIFT_ROWS_S |
| inline_shift_rows_s |
| .else |
| bl shift_rows_s |
| .endif |
| .if ST_VPERM |
| ldr r2,[sp] @ peek at stack to get round count |
| cmp r2,#NUMREFSTATEVPERM |
| bcs 1f |
| bl gen_rand_lfsr_nonpres |
| ldr r1,=statevperm |
| bl addstatevperm @ V shuffle of r4-r11 |
| #if HARDENING |
| // r1 is set to lut_b by addstatevperm |
| ldr r2, =shareB + 0x10 |
| rcp_iequal_nodelay r1, r2 |
| #endif |
| 1: |
| .endif |
| pop {r2} |
| adds r2,r2,#1 @ increment round counter |
| cmp r2,#14 |
| beq 2f @ break from loop? (last round has no mix_cols) |
| push {r2} |
| bl mix_cols_s |
| pop {r2} |
| b rounds_s_mainloop |
| 2: |
| #if HARDENING |
| movs r1, #14 |
| rcp_iequal_nodelay r1, r2 |
| #endif |
| CHK_COUNT 86,6 |
| ldr r12,=rkey_s+14*40 @ final round key shares |
| // todo graham check this is called |
| bl addrkey_s |
| CHK_COUNT 87,6 |
| .if ST_SHAREC @ Avoid func call if the func is empty |
| // todo alex, i assume that skipping this will cause bad things to happen anyway? |
| bl conjshareC @ Undo the effect of shareC from lut_a, lut_b |
| .endif |
| CHK_COUNT 88,6 |
| .if ST_VPERM |
| @ Undo the effects of vperm rotation recorded in statevperm |
| ldr r1,=statevperm |
| ldr r2,[r1] |
| rsbs r0,r2,#0 |
| @ We don't check this is called since failing to undo this is probably going to break decryption |
| // todo alex is this fair? |
| bl addstatevperm |
| .endif |
| |
| pop {r1-r3,r12} |
| push {r1,r3} |
| @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered |
| |
| decryption_start: |
| @ Decrypt ciphertext using AES output in shares: r4-r11 |
| .if ST_SHAREC |
| ldr r0,=shareC |
| ldr r0,[r0] |
| .else |
| movs r0,#0 |
| .endif |
| ldr r14,=chaff |
| @ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff |
| CHK_COUNT 89,6 |
| add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered |
| ldr r3,[r1] @ r3=ciphertext word |
| eors r3,r3,r4 @ r3=r3^shareA |
| ldr r4,[r14] @ barrier load |
| eor r3,r3,r8,ror#16 @ r3=r3^shareB |
| eors r3,r3,r0 @ r3=r3^shareC |
| str r3,[r1] @ plaintext word=r3 |
| ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... |
| ldr r4,[r14,#4] |
| eors r3,r3,r5 |
| eor r3,r3,r9,ror#16 |
| eors r3,r3,r0 |
| str r3,[r1,#4] |
| ldr r3,[r1,#8] |
| ldr r4,[r14,#8] |
| eors r3,r3,r6 |
| eor r3,r3,r10,ror#16 |
| eors r3,r3,r0 |
| str r3,[r1,#8] |
| ldr r3,[r1,#12] |
| ldr r4,[r14,#12] |
| eors r3,r3,r7 |
| eor r3,r3,r11,ror#16 |
| eors r3,r3,r0 |
| str r3,[r1,#12] |
| |
| CHK_COUNT 90,6 |
| |
| @ Trashes r0, r1 |
| check_rnd_count RND_COUNT_decryption_end |
| |
| pop {r1,r3} @ Restore r1 to point to start of buffer |
| @ Restore block counter |
| @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter |
| decryption_end: |
| |
| adds r3,r3,#1 |
| cmp r3,r2 |
| CHK_COUNT 91,6 |
| bne ctr_crypt_mainloop |
| // todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far? |
| #if HARDENING |
| rcp_iequal_nodelay r2, r3 |
| #endif |
| |
| #if WIPE_MEMORY |
| @ Wipe memory from workspace_start up to the stack pointer |
| @ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals |
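@ Outline (comment only) of the wipe:
@   1. random-fill [workspace_start, rstate_all_start)    @ skip the live RNG state
@   2. random-fill [rstate_all_end, sp)
@   3. zero-fill   [workspace_start, sp)                   @ now also covers the RNG state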
| ldr r4,=workspace_start |
| add r5, r4, #rstate_all_start - workspace_start |
| #if HARDENING |
| ldr r7,=workspace_start |
| add r6, r4, #rstate_all_start - workspace_start |
| rcp_iequal_nodelay r4, r7 |
| #endif |
| #if HARDENING |
| // todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far? |
| @ Recheck of above |
| rcp_iequal_nodelay r3, r2 |
| #endif |
| 1: |
| bl gen_rand_sha_nonpres |
| stmia r4!,{r0} |
| cmp r4,r5 |
| bcc 1b |
| #if HARDENING |
| rcp_iequal_nodelay r4, r6 |
| mov r6,sp |
| #endif |
// note: if this load is skipped, then we just carry on erasing from where we left off before
| .if rstate_all_end <= rstate_all_start |
| .err |
| .endif |
| ldr r4,=rstate_all_end |
| mov r5,sp @ gcc arm assembler says cmp r4,sp is deprecated, so use another register |
| 1: |
| bl gen_rand_sha_nonpres |
| stmia r4!,{r0} |
| cmp r4,r5 |
| bcc 1b |
| #if HARDENING |
| rcp_iequal_nodelay r4, r6 |
| #endif |
| |
| @ Then fill everything with zeros so as not to leave behind clues about the RNG state |
| ldr r4,=workspace_start |
| movs r0,#0 |
| mov r5,sp |
| 1: |
| stmia r4!,{r0} |
| cmp r4,r5 |
| bcc 1b |
| #if HARDENING |
| rcp_iequal_nodelay r4, r6 |
| #endif |
| #endif |
| |
| .if GEN_RAND_SHA |
| SET_COUNT 23,6 |
| bl reset_sha_trng @ clear out the SHA hardware |
| .endif |
| pop {r0-r12,r14} |
| CHK_CANARY r12,CTAG0,6 |
| bx r14 |