/* MEMORY LAYOUT ASSUMPTIONS
The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see
the macro getchaffaddress.
The stack must be located at the end of Y scratch RAM: see the memory
wiping at the end of ctr_crypt_s where memory between the start of Y
scratch RAM and the stack pointer is overwritten.
*/
.syntax unified
.cpu cortex-m33
.thumb
#include "config.h"
#include "hardware/platform_defs.h"
#include "hardware/regs/addressmap.h"
#include "hardware/regs/clocks.h"
#include "hardware/regs/sha256.h"
#include "hardware/regs/resets.h"
#include "hardware/regs/rosc.h"
#include "hardware/regs/trng.h"
#include "hardware/rcp.h"
#if HARDENING
@ Number of calls to gen_rand_sha[_nonpres]
#define RND_COUNT_decrypt 394 // From decrypt up to call to ctr_crypt_s
#define RND_COUNT_ctr_crypt_s_init (17 + 32 * CT_BPERM) // Init phase of ctr_crypt_s
#define RND_COUNT_ctr_crypt_mainloop_A (4 + ST_VPERM + ST_SHAREC)
#define RND_COUNT_refreshchaff_and_lfsr 2
#define RND_COUNT_remap 2
#define RND_COUNT_decryption_end 3
#endif
.global decrypt
.global chaff
.extern lock_key
@ RCP macros
#define CTAG0 0x2a
#define CTAG1 0x2b
#define CTAG2 0x2c
#define CTAG3 0x2d
#define CTAG4 0x2e
#define CTAG5 0x30
#define CTAG6 0x31
#define CTAG7 0x32
#define CTAG8 0x33
#define CTAG9 0x34
#define CTAG10 0x35
#define CTAG11 0x36
#define CTAG12 0x37
#define CTAG13 0x38
#define CTAG14 0x39
#define CTAG15 0x3a
#define CTAG16 0x3b
#define CTAG17 0x3c
#define CTAG18 0x3d
#define CTAG19 0x3e
#define CTAG20 0x3f
#define CTAG21 0x29
@ number of blocks from the TRNG processed to initialise rstate_sha
#define TRNG_BLOCKS 25
@ if GEN_RAND_SHA==0 then we don't call the counting version
#if HARDENING && GEN_RAND_SHA
.macro check_rnd_count count
.if !(\count & 0xffffff00)
movs r1, #\count
.else
ldr r1, =\count
.endif
movs r0, #(\count & 1) ^ 1
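@ (Note: r0 is primed to differ from \count in its low bit, presumably so that if the call below is glitched past, the rcp_iequal still faults)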
bl check_rnd_count_func
rcp_iequal_nodelay r1, r0
#if DOUBLE_HARDENING
rcp_iequal_nodelay r0, r1
#endif
.endm
@ r1 has the expected count
@ Trashes r0
.macro check_rnd_count_dynamic
mov r0, sp
bl check_rnd_count_func
rcp_iequal_nodelay r1, r0
#if DOUBLE_HARDENING
rcp_iequal_nodelay r0, r1
#endif
.endm
.macro reset_rnd_count
bl reset_rnd_count_func
.endm
.macro reset_rnd_count_checked
@ This version verifies that the count was actually reset
uxtb r0, r1
bl reset_rnd_count_func
ldr r0, [r0]
bics r1, #0xff00ff
rcp_iequal_nodelay r1, r0
.endm
#else
.macro check_rnd_count count
.endm
.macro reset_rnd_count
.endm
.macro reset_rnd_count_checked
.endm
#endif
@ The lower jitterpriority is, the more jitter is applied
.macro SET_COUNT n,jitterpriority
.if RC_COUNT
.if RC_JITTER > \jitterpriority
rcp_count_set \n
.else
rcp_count_set_nodelay \n
.endif
.endif
.endm
.macro CHK_COUNT n,jitterpriority
.if RC_COUNT
.if RC_JITTER > \jitterpriority
rcp_count_check \n
.else
rcp_count_check_nodelay \n
.endif
.endif
.endm
.macro GET_CANARY rx,tag,jitterpriority
.if RC_CANARY
.if RC_JITTER > \jitterpriority
rcp_canary_get \rx,\tag
.else
rcp_canary_get_nodelay \rx,\tag
.endif
.endif
.endm
.macro CHK_CANARY rx,tag,jitterpriority
.if RC_CANARY
.if RC_JITTER > \jitterpriority
rcp_canary_check \rx,\tag
.else
rcp_canary_check_nodelay \rx,\tag
.endif
.endif
.endm
@ Clear internal stripe load registers, and r0-r3
@ 0 <= offset <= 32
.macro clear03 offset=0
getchaffaddress r0,\offset
ldmia r0,{r0-r3}
.endm
.macro clear03_preserve_r3 offset=0
getchaffaddress r0,\offset
ldmia r0!,{r1-r2}
ldmia r0!,{r1-r2}
.endm
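@ (clear03_preserve_r3 pulls four chaff words through the load path into r1,r2,
@ clearing the internal striped load registers while leaving r3 untouched;
@ r0 is left pointing into the chaff area rather than holding a random word)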
.macro clear01 offset=0
getchaffaddress r0,\offset
ldmia r0,{r0,r1}
.endm
@ Put workspace in the second scratch area
@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants,
@ otherwise they may end up silently replaced with 0 or 0xffffffff
.section .scratch_y.aes,"aw",%progbits
workspace_start:
@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress
@ We need to set the chaff address directly with MOVs, rather than setting it with a load as normal, because at the point
@ the macro is called we have just done a load of a sensitive value at a known memory offset mod 16, and the idea is that
@ the next load is going to be of a random number (in the "chaff" memory) at that same offset mod 16, so we can't afford
@ to do a ldr \rx, =0x20081000 + \offset first, as this will load a non-random value from an uncontrolled memory location mod 16.
@ Ideally we'd avoid the magic number 0x20081000 by using ADR \rx, chaff+\offset, but the linker does not support this.
.macro getchaffaddress rx,offset=0
mov \rx,#(0x1000+\offset)
movt \rx,#0x2008
.endm
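@ Example: "getchaffaddress r0,8" assembles to mov r0,#0x1008 / movt r0,#0x2008,
@ leaving r0=0x20081008 without performing any load along the way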
chaff:
.space 48
.balign 16
rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words
@ see comment at init_key_4way for description of layout and meaning of rkey_s
.space 600
rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space
.space 128
.if CT_BPERM
bperm_rand: @ 32 half words that define the oblivious permutation of blocks
.space 64
.endif
.balign 16
permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s)
perm16:
.space 16
@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s
.balign 16
fourway: @ Must be 0 mod 16
shareA: @ 0 mod 16
.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16
shareB: @ 4 mod 16
.space 20
shareC: @ 8 mod 16
.space 4
statevperm: @ 12 mod 16
.space 4 @ vperm state rotation: only last two bits are operational; other bits random
RKshareC: @ Round key common share C; see comment at init_key_4way for explanation
.space 4
RKshareCchange: @ Temporary used by ref_roundkey_shares_s
.space 4
IV0: @ 2-way share of IV for block 0
.space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16)
@ The gap at IV0[4] is to defeat unsharing by internal striped memory registers
@ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless
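@ Example: word 0 of the IV is recovered as IV0[0] ^ (IV0[5] ror #16)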
@ Regardless of configuration, the code uses a single 256-entry LUT,
@ which is a simple S-box table.
@ The LUT is represented as two shares, lut_a and lut_b,
@ whose values must be EORed. Furthermore, the contents of each share are
@ scrambled according to a 4-byte "map". The map comprises two bytes that
@ are EORed into the addressing of the share, and two bytes that are
@ EORed into the data read back from the share. Performing a lookup
@ of a value x involves computing
@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁
@ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and
@ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share.
@ In practice the result of a lookup is itself represented in two
@ shares, namely
@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and
@ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁
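@ Note that EORing the two result shares together recovers the full lookup:
@ (lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀) ^ (lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁)
@ = lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁,
@ as above, while neither result share on its own determines the looked-up value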
.balign 16
lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup)
.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b
.space 4
.space 4 @ align to 8 mod 16
lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup)
.space 256
lut_b_map:
.space 4
.space 4 @ align to multiple of 8
.balign 16
rstate_all_start: @ Mark start of RNG data to allow selective memory wipe
rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero
.space 16
jstate: @ 32-bit jitter state
.space 4
rstate_lfsr: @ 32-bit LFSR random state and constant used to step it
.space 4
.word 0x1d872b41 @ constant that defines a maximal-length LFSR
rstate_count:
.space 4
rstate_all_end: @ Mark end of RNG data to allow selective memory wipe
.if CT_BPERM
.balign 16
murmur3_constants: @ Five constants used in murmur3_32 hash
.word 0xcc9e2d51
.word 0x1b873593
.word 0xe6546b64
.word 0x85ebca6b
.word 0xc2b2ae35
.endif
scratch_y_end:
@ Initialisation code in main .text section
.section .text,"ax",%progbits
@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments.
@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some
@ random numbers.
@ Trashes r0-r6
.balign 4
init_rstate:
CHK_COUNT 24,6
ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET
ldr r5,=SHA256_BASE
movs r1,#1
str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET]
ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0
movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit
str r1,[r5,#SHA256_CSR_OFFSET]
str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET]
#if HARDENING
movs r3, #0
#endif
movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving
@ time for previous SHA computation to complete
2:
movs r1,#0xff @ TRNG setup is inside loop in case it is skipped.
str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples
str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started
str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD)
adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET
1:
ldr r1,[r4,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET] @ wait for 192 ROSC samples to fill EHR; should take constant time
cmp r1,#0
bne 1b
subs r6,#1 @ done?
beq 3f
movs r1,#8
1:
ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1)
str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block
#if HARDENING
adds r3,#1
#endif
subs r1,#1
bne 1b
#if HARDENING
ldr r1, =TRNG_BASE+TRNG_EHR_DATA0_OFFSET+32
rcp_iequal_nodelay r0, r1
#endif
ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length
str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET]
b.n 2b
3:
#if HARDENING
movs r2, #(TRNG_BLOCKS*2) * 8
rcp_iequal_nodelay r2, r3
#endif
#if HARDENING
@ good test that we are dealing with real hardware
ldr r2,[r5,#SHA256_CSR_OFFSET]
movw r1,#SHA256_CSR_RESET
rcp_iequal_nodelay r1, r2
rcp_iequal_nodelay r2, r1
#endif
CHK_COUNT 25,6
str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0
str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET]
adds r5,r5,#SHA256_SUM0_OFFSET
@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc)
ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output
ldr r6,=rstate_sha
@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc), r6=rstate_sha
stmia r6,{r0-r3}
CHK_COUNT 26,6
movs r0,#0
#if !HARDENING
strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data"
#else
str r0,[r6] @ make sure rstate_sha[0] has word 0 set to 0, representing "out of data" (24-31) and 0 numbers generated (0-23)
#endif
@ try to find a non-zero initialiser to create a non-degenerate LFSR random state
ldr r1,[r5,#16] @ SHA SUM4
cbnz r1,1f @ is word 4 non-zero? then use it
ldr r1,[r5,#20] @ SHA SUM5
cbnz r1,1f @ otherwise, is word 5 non-zero? use it
mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability)
1:
str r1,[r6,#rstate_lfsr-rstate_sha]
@ try to find a non-zero initialiser to create a non-degenerate ROSC random state
ldr r1,[r5,#24] @ SHA SUM6
cbnz r1,1f @ is word 6 non-zero? then use it
ldr r1,[r5,#28] @ SHA SUM7
cbnz r1,1f @ otherwise, is word 7 non-zero? use it
mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability)
1:
ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE
str r1,[r2,#0] @ Initialise ROSC LFSR
CHK_COUNT 27,6
#if HARDENING
ldr r3,=ROSC_RANDOM_OFFSET+ROSC_BASE
cbnz r1, 1f
rcp_panic
1:
ldr r3, [r3]
rcp_iequal_nodelay r1, r3
#endif
.if GEN_RAND_SHA
.if SH_JITTER
movs r2,#0
str r2,[r6,#jstate-rstate_sha]
.endif
.endif
CHK_COUNT 28,6
bx r14
.thumb_func
decrypt:
@ r0=4-way key, r1=IV_shareA, r2=IV_shareB, r3=message buffer, [sp]=number of blocks
ldr r12,[sp] @ Pop 5th argument in r12 (which we are allowed to treat as scratch according to AAPCS)
push {r14}
GET_CANARY r14,CTAG3,6
#if !CALLER_INIT_RCP_COUNT
SET_COUNT 23,6
#endif
push {r4-r11,r14}
push {r0-r3,r12} @ Save the five arguments
bl reset_sha_trng
bl init_rstate
@ randomly re-share the LUT contents
ldr r4,=lut_a
mov r5,#64 @ 64 words = 256 bytes
1:
bl gen_rand_sha_nonpres
ldr r6,[r4,#lut_b-lut_a] @ EOR a random word into both shares
eors r6,r6,r0
@ r0 must be EORed into both shares; if it were EORed into only one, the LUT would no longer be right
str r6,[r4,#lut_b-lut_a]
ldr r6,[r4]
#if HARDENING
eors r7,r6,r0
eors r8,r7,r6
rcp_iequal_nodelay r8, r0
stmia r4!,{r7}
#else
eors r6,r6,r0
stmia r4!,{r6}
#endif
subs r5,r5,#1
bne 1b
#if HARDENING
ldr r5,=lut_a + 256
rcp_iequal_nodelay r4, r5
#endif
CHK_COUNT 29,6
#if HARDENING
@check again as this is quite important
rcp_iequal_nodelay r5, r4
#endif
bl remap @ scramble the LUTs
pop {r0} @ pointer to 4way key data
bl init_key_4way
// todo alex this may trash r12; is that ok?
bl lock_key
CHK_COUNT 32,6
pop {r0-r3} @ r0=IV_shareA, r1=IV_shareB, r2=message, r3=num blocks
bl ctr_crypt_s
bl randomisechaff
clear03
pop {r4-r11,r14}
CHK_CANARY r14,CTAG3,6
pop {r15}
.thumb_func
reset_sha_trng:
GET_CANARY r0,CTAG19,0
ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET
ldr r2,[r1]
ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS
orrs r2,r2,r3
str r2,[r1] @ reset the SHA hardware and the TRNG hardware
CHK_COUNT 23,6
bics r2,r2,r3
str r2,[r1] @ release the reset
CHK_CANARY r0,CTAG19,0
bx r14
@ Put AES core code in first scratch area
.section .scratch_x.aes,"ax",%progbits
@ if GEN_RAND_SHA==0 then we don't call the counting version
#if HARDENING && GEN_RAND_SHA
check_rnd_count_func:
@ NOTE: we don't bother with a canary here as we don't write anything
ldr r0,=rstate_sha
ldr r0, [r0]
rsbs r0,r0,#0 @ Negate bottom 24 bits to get the number of calls to gen_rand_sha[_nonpres] since the last reset
bfc r0,#24,#8 @ clear the top (word-counter) byte, leaving the 24-bit call count
bx r14
reset_rnd_count_func:
push {lr}
GET_CANARY lr,CTAG11,0
ldr r0,=rstate_sha
ldrb r1, [r0, #3]
orrs r1, #1
lsls r1, #24
str r1, [r0]
CHK_CANARY lr,CTAG11,0
pop {pc}
#endif
.if GEN_RAND_SHA
@ we need SHA256_SUM0_OFFSET==8 (see note below)
.if SHA256_SUM0_OFFSET!=8
.err
.endif
@ Return single random word in r0
@ Preserves r1-r13
.balign 4
gen_rand_sha:
push {r1-r3,lr}
GET_CANARY r1,CTAG1,2
push {r1}
.if SH_JITTER
ldr r2,=rstate_sha
ldr r0,[r2,#jstate-rstate_sha]
lsls r3,r0,#30
lsrs r3,#28
movs r1,#1
lsls r3,r1,r3 @ 1<<(4*(r0&3))
udiv r3,r3,r1 @ Takes constant + (r0&3) cycles
lsrs r0,r0,#2
bne 1f
bl gen_rand_sha_nonpres
ldr r2,=rstate_sha
#if HARDENING
ldr r1,[r2] @ Make this (SH_JITTER) not affect rnd_count
adds r1,r1,#1 @ (compensating for call to gen_rand_sha_nonpres which decrements the count by 1)
str r1,[r2] @ The purpose is to simplify check_rnd_count calls, and to avoid having to reset jstate frequently
#endif
1:
str r0,[r2,#jstate-rstate_sha]
.endif
bl gen_rand_sha_nonpres
pop {r1}
CHK_CANARY r1,CTAG1,0
pop {r1-r3,pc}
@ Return single random word in r0
@ Trashes r1-r3
.balign 4
gen_rand_sha_nonpres:
push {lr}
GET_CANARY lr,CTAG18,0
ldr r2,=rstate_sha
#if !HARDENING
ldr r3,=SHA256_BASE
ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers)
subs r0,r1,#4 @ decrement it to previous SUM register
ble 1f @ if the offset was 4 or less we have run out of SUM register values
strb r0,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[]
ldr r0,[r3,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8
#else
ldr r3,=SHA256_BASE
ldr r1,[r2] @ get word counter (8) : rand counter (24) from first word of rstate_sha[] (offset into SUM registers)
lsls r0, r1, #1 @ clear C (also set N which may force us down BLE path on skip of the sub below)
sbcs r0,r1,#0x04000000 @ decrement word counter for previous SUM register (and decrement rand counter due to C == 0)
str r0,[r2] @ save updated word counter / rand counter in bottom word of rstate_sha[]
asrs r1, r0, #24
ble 1f @ if the offset was 4 or less we have run out of SUM register values
ldr r2,=SHA256_BASE + 4
adds r2, r1
adds r1, r3, r0, asr #24
ldr r0, [r2], #-4
rcp_iequal_nodelay r1, r2
#endif
b gen_rand_sha_nonpres_exit
1:
@ [CK_JITTER code was here]
movs r0,#SHA256_SUM6_OFFSET+1
#if !HARDENING
strb r0,[r2] @ reset word counter: the +1 is compensated for later
#else
strb r0,[r2,#3] @ reset word counter: the +1 is compensated for later
#endif
movw r1,#(1<<SHA256_CSR_BSWAP_LSB)+(1<<SHA256_CSR_START_LSB)
str r1,[r3,#SHA256_CSR_OFFSET] @ start SHA256 hardware
movs r0,#3 @ take four words from rstate_sha, incrementing as we go
ldr r1,[r2]
#if !HARDENING
adds r1,r1,#255 @ overall this adds 256 to the value in rstate_sha and resets the bottom byte to SHA256_SUM6_OFFSET
#else
adds r1,r1,#0xff000000 @ overall this adds 256 to the value in rstate_sha and resets the bottom byte to SHA256_SUM6_OFFSET
#endif
1:
str r1,[r2],#4
str r1,[r3,#SHA256_WDATA_OFFSET]
cbz r0,3f
ldr r1,[r2]
adcs r1,r1,#0
sub r0,r0,#1 @ preserve the carry
b 1b
3:
movs r1,#0x80 @ End of message bit (with byte-swapped endianity) = start of message padding
str r1,[r3,#SHA256_WDATA_OFFSET]
movs r1,#9
1:
str r0,[r3,#SHA256_WDATA_OFFSET]
subs r1,r1,#1
bpl 1b
lsls r2, r1, #31 @ Specifies message length = 128 bits (with byte-swapped endianity) (i.e. 0x80000000)
str r2,[r3,#SHA256_WDATA_OFFSET]
1:
ldr r0,[r3,#SHA256_CSR_OFFSET]
#if HARDENING
asrs r2, #1
#endif
lsrs r0,r0,#SHA256_CSR_SUM_VLD_LSB+1
bcc 1b @ wait for hardware to finish
#if HARDENING
@ r1 is -1 from loop above
@ r2 is asr-ed right from 0x80000000. Empirically the loop takes more than 6 iterations, so we should have multiple 1s in the high bits
@ note also that if 0x80000000 was not set above correctly, r2 might not be negative
asrs r2, #26
@ BEWARE this will fail if you step thru the above loop in the debugger as it will finish too quickly!
rcp_iequal_nodelay r1, r2
#endif
ldr r0,[r3,#SHA256_SUM7_OFFSET]
gen_rand_sha_nonpres_exit:
CHK_CANARY lr,CTAG18,0
pop {pc}
.endif
@ simple LFSR rand versions
@ return a random number in r0
@ This version preserves all r1-r13
@ 23 or 24 cycles including branch = 23 or 24 cycles/word
@ (would be 20 or 21 cycles if written out)
.balign 4
.thumb_func
.if !GEN_RAND_SHA
gen_rand_sha:
gen_rand_lfsr: @ Not used
push {r14}
GET_CANARY r14,CTAG2,2
push {r1-r3,r14}
bl gen_rand_lfsr_nonpres
pop {r1-r3,r14}
CHK_CANARY r14,CTAG2,0
pop {r15}
.endif
@ Trashes r1,r2,r3
@ 12 cycles including branch = 12 cycles/word
.balign 4
.if !GEN_RAND_SHA
gen_rand_sha_nonpres:
.endif
gen_rand_lfsr_nonpres:
GET_CANARY r3,CTAG10,0
ldr r2,=rstate_lfsr
ldmia r2,{r0-r1} @ r0=state_in, r1=0x1d872b41=constant for a maximum-length sequence
and r1,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0
eors r0,r1,r0,lsl#1
#if HARDENING
@ Basically r3 &= (r0 ? 0xffffffff : 0): if the LFSR state has collapsed to 0 this zeroes r3 so the canary check below faults, while otherwise leaving r3 unperturbed (unless the and is skipped)
clz r1, r0
subs r1, #32
asrs r1, #5
ands r3, r1
#endif
str r0,[r2]
CHK_CANARY r3,CTAG10,0
bx r14
.macro loadlfsr
ldr r2,=rstate_lfsr
ldmia r2,{r0-r1} @ r0=lfsr_state, r1=lfsr_const=0x1d872b41 for a maximum-length sequence
.endm
.macro steplfsr
ands r3,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0
eors r0,r3,r0,lsl#1
.endm
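@ Worked example of one steplfsr: if r0=0x80000001 the top bit is set, so the
@ ands keeps the constant and the new state is
@ (0x80000001<<1) ^ 0x1d872b41 = 0x00000002 ^ 0x1d872b41 = 0x1d872b43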
.macro steplfsr_check
steplfsr
bne steplfsr_check\@
rcp_panic
steplfsr_check\@:
.endm
.macro savelfsr
str r0,[r2]
.endm
.ltorg
.balign 4
.thumb_func
makesmallperm:
@ Make a uniformly random permutation of R0 bytes and store the resulting byte array at R1
@ Should be very uniform up to R0=10; maybe 11 or 12 are also OK. (10! << 2^32)
@ To make it valid up to R0=256, move the bl gen_rand_sha inside the loop
@ Uses inside-out method (slightly more efficient variant of Fisher-Yates)
@ Trashes r0-r3
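@ The "umull r0,r3,r0,r2" steps below compute r3 = floor(r0*r2 / 2^32), i.e. an
@ (approximately uniform) index in [0,r2-1] without a divide, leaving the
@ low-word product in r0 as the randomness for the next iteration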
push {r14}
GET_CANARY r14,CTAG4,6
push {r4-r6,r14}
movs r4,r1
movs r6,r0
movs r1,#0
movs r2,#1
bl gen_rand_sha
1:
@ r1,r2=i,i+1, i=0, 2, 4, ...
cmp r1,r6
beq 2f
umull r0,r3,r0,r2
ldrb r5,[r4,r3]
strb r5,[r4,r1]
strb r1,[r4,r3]
adds r1,r1,#2
@ r2,r1=i,i+1, i=1, 3, 5, ...
cmp r2,r6
beq 2f
umull r0,r3,r0,r1
ldrb r5,[r4,r3]
strb r5,[r4,r2]
strb r2,[r4,r3]
adds r2,r2,#2
b 1b
2:
pop {r4-r6,r14}
CHK_CANARY r14,CTAG4,6
pop {r15}
.balign 4
.thumb_func
makeperm16:
@ Make a random permutation of 16 things using the inside-out method (slightly more efficient variant of Fisher-Yates)
@ Store it in the 16 bytes at perm16
@ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha
@ Trashes r0-r5
GET_CANARY r0,CTAG5,1
push {r0,r14}
ldr r4,=perm16
bl gen_rand_sha_nonpres
@ i=0
movs r1,#0
movs r2,#1 @ r1,r2=i,i+1
strb r1,[r4]
@ i=1
adds r1,r1,#2 @ r1,r2=i+1,i
umull r0,r3,r0,r1
ldrb r5,[r4,r3]
strb r5,[r4,r2]
strb r2,[r4,r3]
1:
@ i=2, 4, 6, 8
adds r2,r2,#2 @ r1,r2=i,i+1
umull r0,r3,r0,r2
ldrb r5,[r4,r3]
strb r5,[r4,r1]
strb r1,[r4,r3]
@ i=3, 5, 7, 9
adds r1,r1,#2 @ r1,r2=i+1,i
umull r0,r3,r0,r1
ldrb r5,[r4,r3]
strb r5,[r4,r2]
cmp r1,#10
strb r2,[r4,r3]
bne 1b
@ refresh random number after extracting 10! from it
@ 10! and 16!/10! are both much less than 2^32, so the permutation will be extremely close to uniform
bl gen_rand_sha
1:
@ i=10, 12, 14
adds r2,r2,#2 @ r1,r2=i,i+1
umull r0,r3,r0,r2
ldrb r5,[r4,r3]
strb r5,[r4,r1]
strb r1,[r4,r3]
@ i=11, 13, 15
adds r1,r1,#2 @ r1,r2=i+1,i
umull r0,r3,r0,r1
ldrb r5,[r4,r3]
strb r5,[r4,r2]
cmp r1,#16
strb r2,[r4,r3]
bne 1b
pop {r0,r14}
CHK_CANARY r0,CTAG5,4
bx r14
.balign 4
.thumb_func
remap:
@ do a random remap of the LUTs
@ preserves r0-r11; trashes r12
GET_CANARY r12,CTAG6,6
push {r0-r12,r14}
bl gen_rand_sha_nonpres
ldr r1,=lut_a
bl remap_1
bl gen_rand_sha_nonpres
ldr r1,=lut_b
bl remap_1
pop {r0-r12,r14}
CHK_CANARY r12,CTAG6,6
bx r14
remap_1:
@ r0: B0:xa B1:xb B2:ya B3:yb
@ r1: array of 256 bytes, followed by a 4-byte map
@ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0
GET_CANARY r6,CTAG7,6
push {r6,r14}
mov r14,0x01010101
ubfx r6,r0,#16,#8
ubfx r7,r0,#24,#8
mul r6,r6,r14 @ data remaps ya and yb, byte replicated
mul r7,r7,r14
movw r10,#0x1010
and r10,r10,r0,lsl#3 @ 0/16 in each byte of r10 from b1 and b9 of r0, ready for rotates by 0 or 16
mov r3,#0x7f7f7f7f
ubfx r2,r0,#0,#1
lsl r11,r3,r2 @ 0x7f or 0xfe in each byte of r11, ready for sel of rev16
ubfx r2,r0,#8,#1
lsl r12,r3,r2
ldr r2,[r1,#0x100] @ old map
eors r2,r2,r0
str r2,[r1,#0x100] @ updated map
// todo graham; what is the effect of not doing the whole loop - is it broken if you just do some?
mov r2,#252 @ loop over entries
1:
ldr r4,[r1,r2]
eor r3,r2,r0
eor r3,r3,r0,ror#8
and r3,r3,#0xfc @ r3=remapped address r2
ldr r5,[r1,r3]
eors r5,r5,r6 @ remap data; ensure case x==0 works by doing both remaps on same side
eors r5,r5,r7
lsr r8,r10,#8
ror r5,r5,r8 @ ROR#16 is the same as eor of address with 2
ror r5,r5,r10
rev16 r8,r5 @ REV16 is the same as eor of address with 1
uadd8 r9,r11,r11
sel r5,r8,r5
rev16 r8,r5
uadd8 r9,r12,r12
sel r5,r8,r5
mul r8,r14,r2
mul r9,r14,r3
usub8 r8,r8,r9 @ bytewise comparison of original address and remapped address, both byte replicated
sel r8,r4,r5 @ swap r4 and r5 as necessary in constant time
str r8,[r1,r2] @ write possibly swapped values back
sel r8,r5,r4
str r8,[r1,r3]
subs r2,r2,#4
bpl 1b
pop {r6,r14}
CHK_CANARY r6,CTAG7,6
bx r14
.if RK_ROR
@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC
@ Trashes r0-r12
@ If i = word number 0..3,
@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4])
@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16
.macro ref_roundkey_shares_s_impl
ldr r4,=rkey_s
loadlfsr
steplfsr_check @ r0=change in RKshareC
ldr r2,=RKshareCchange
str r0,[r2]
ldr r3,=RKshareC
ldr r5,[r3]
eors r5,r5,r0
str r5,[r3]
@ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter
ref_roundkey_shares_s_loop:
ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA
ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB
mov r2,r12,lsr#30 @ r2 = vpermB
sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk)
mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32
mov r12,r12,ror r2
usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2)
@ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff
steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr_check; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2]
ldr r3,=RKshareCchange
ldr r3,[r3]
movs r2,#0
usub8 r10,r2,r10
ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2
ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2
ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2
ror r2,r3,r10; eors r8,r8,r2
subs r4,r4,#20
stmia r4,{r5-r8}
adds r4,r4,#40
subs r11,r11,#1
bne ref_roundkey_shares_s_loop
#if HARDENING
ldr r5,=rkey_s + 40 * 15
rcp_iequal_nodelay r4, r5
#endif
ldr r2,=rstate_lfsr @ restore rstate_lfsr
savelfsr @ Save lfsr_state
clear03 24
.endm
.else // RK_ROR
@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC
@ Trashes r0-r11
.macro ref_roundkey_shares_s_impl
ldr r4,=rkey_s
loadlfsr
steplfsr_check @ r0=change in RKshareC
ldr r3,=RKshareC
ldr r5,[r3]
eors r5,r5,r0
str r5,[r3]
mov r10,r0
ref_roundkey_shares_s_loop:
ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9
@ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later)
ldr r3,[r4,#16] @ rkey shareB has a vperm of r10>>30
movs r3,r3,lsr#30
sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk)
@ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter
steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
steplfsr_check; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]
subs r4,r4,#20
stmia r4,{r5-r8}
adds r4,r4,#40
subs r11,r11,#1
@ clear03: would need to do this with, say r3,r5-r8
bne ref_roundkey_shares_s_loop
savelfsr
clear03 24
#if HARDENING
ldr r5,=rkey_s + 40 * 15
rcp_iequal_nodelay r4, r5
#endif
.endm
.endif
.if INLINE_REF_ROUNDKEY_SHARES_S
.macro inline_ref_roundkey_shares_s
ref_roundkey_shares_s_starts:
mov r11,#15 @ there are 15 expanded keys
ref_roundkey_shares_s_impl
ref_roundkey_shares_s_end:
.endm
.else
.balign 4
.thumb_func
ref_roundkey_shares_s:
mov r11,#15 @ there are 15 expanded keys
ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds
push {lr}
GET_CANARY lr,CTAG8,6
ref_roundkey_shares_s_impl
CHK_CANARY lr,CTAG8,6
pop {pc}
.endif
.if RK_ROR
@ Rotates roundkey vperms and RK_ROR rotations by random amounts
@ Trashes r0-r10
@ If i = word number 0..3,
@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4])
@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16
.macro ref_roundkey_hvperms_s_impl
ldr r10,=rkey_s
ref_roundkey_hvperms_s_loop:
bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations
ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations
str r0,[r10,#16]
mov r8,r0,lsr#30 @ r8=new vperm low
sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk
mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32
mov r0,r0,ror r8
usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations)
movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1
movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2]
adds r10,r10,#20
adds r7,r7,#1
cmp r7, #30
bne ref_roundkey_hvperms_s_loop
clear03 28
.endm
.else
@ Rotates roundkey vperms by random amounts
@ Trashes r0-r9
.macro ref_roundkey_hvperms_s_impl
bl gen_rand_lfsr_nonpres
ldr r1,=rkey_s
ref_roundkey_hvperms_s_loop:
cmp r7,#15
bne 2f
@ Get a new random r0 after using 15 x 2 bits of the original one
@ Note that the junk bits (2-31) in the vperms are not adjusted independently, but that's no big loss,
@ and the gain is only calling gen_rand_lfsr twice instead of 30 times.
push {r1}; bl gen_rand_lfsr_nonpres; pop {r1}
2:
ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits)
mov r8,r9,lsr#30 @ r8=old vperm (low)
add r6,r9,r0 @ r6=new vperm (high) | new junk
str r6,[r1,#16]
rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits
ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1
ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1
ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1
ands r6,r6,#3; str r5,[r1,r6,lsl#2]
adds r1,r1,#20
movs r0,r0,ror#2
adds r7,r7,#1
cmp r7, #30
bne ref_roundkey_hvperms_s_loop
clear03 28
.endm
.endif
.if INLINE_REF_ROUNDKEY_HVPERMS_S
.macro inline_ref_roundkey_hvperms_s
ref_roundkey_hvperms_s_starts:
movs r7,#0
ref_roundkey_hvperms_s_impl
ref_roundkey_hvperms_s_end:
.endm
.else
.balign 4
.thumb_func
ref_roundkey_hvperms_s:
movs r7,#0
ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 15 rounds
GET_CANARY r0,CTAG9,6
push {r0, lr}
ref_roundkey_hvperms_s_impl
pop {r0}
CHK_CANARY r0,CTAG9,6
pop {pc}
.endif
.ltorg
.if ST_VPERM
.balign 4
.thumb_func
@ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount
@ given in the bottom two bits of R0 and update the rotation recorded at statevperm.
@ On entry R1 must point to statevperm.
@ Trashes r0-r3,r12
@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ...
@ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ...
@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise.
addstatevperm:
push {r14}
GET_CANARY r14,CTAG20,0
ldr r2,[r1]
adds r2,r2,r0
str r2,[r1]
ldr r1,=shareA
ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1
ldmia r1,{r4-r7}
getchaffaddress r12 @ Overwrite temporary storage with random numbers
ldmia r12!,{r2,r3}
stmia r1!,{r2,r3}
ldmia r12!,{r2,r3}
stmia r1!,{r2,r3}
ldr r1,=shareB
ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1
ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1
ldmia r1,{r8-r11}
getchaffaddress r0,16 @ Overwrite temporary storage with random numbers
ldmia r0!,{r2,r3}
stmia r1!,{r2,r3}
ldmia r0!,{r2,r3}
stmia r1!,{r2,r3}
addstatevperm_exit: @ label the exit point so that it can be specified to analysis code
CHK_CANARY r14,CTAG20,0
pop {pc}
.endif
@ Conjugate lut_a, lut_b with (state) shareC
@ I.e., EOR the input and output with shareC.
@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B
@ Arbitrarily choosing a0, b1 and d0
.balign 4
conjshareC:
push {r14}
GET_CANARY r14,CTAG21,0
.if ST_SHAREC
ldr r1,=shareA
ldr r0,[r1, #shareC-shareA] @ Get shareC as a word (all bytes the same)
ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs...
ldr r2,[r1,#0x100]
eors r2,r2,r0,lsr#24
str r2,[r1,#0x100]
movs r0,r0,lsr#16
ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0.
ldr r2,[r1,#0x100]
eors r2,r2,r0,lsl#8
str r2,[r1,#0x100]
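@ (r0,lsr#24 lands the shareC byte on byte 0 of lut_a_map, i.e. a0; after the
@ lsr#16 the two remaining copies of the byte, shifted left by 8, land on
@ bytes 1 and 2 of lut_b_map, i.e. b1 and d0 - matching the choice above)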
.endif
CHK_CANARY r14,CTAG21,0
pop {pc}
.macro shift_rows_s_impl
@ First "rotate" the two most-significant bytes of the state by two registers
@ Trashes r0-r3
@ Slightly faster (but not shorter?) with ubfx/bfi
eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r4,r4,r0
eors r6,r6,r0
eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r5,r5,r0
eors r7,r7,r0
@ next "rotate" the two odd-significance bytes of the state by one register
eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta;
ands r0,r0,#0xff00ff00
eors r4,r4,r0
eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r5,r5,r0
eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r6,r6,r0
eors r7,r7,r1 @ state[3]^=tb;
@ repeat for other share, conjugated by ror#16
clear01 @ barrier
eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta;
lsls r0,r0,#16
lsrs r0,r0,#16
eors r8,r8,r0
eors r10,r10,r0
eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta;
lsls r0,r0,#16
lsrs r0,r0,#16
eors r9,r9,r0
eors r11,r11,r0
eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta;
ands r0,r0,#0xff00ff00
eors r8,r8,r0
eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r9,r9,r0
eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r10,r10,r0
eors r11,r11,r1 @ state[3]^=tb;
clear01 @ barrier
.endm
.if INLINE_SHIFT_ROWS_S
.macro inline_shift_rows_s
shift_rows_s_starts:
shift_rows_s_impl
shift_rows_s_end:
.endm
.else
.balign 4
.thumb_func
@ Not going to use canaries here as it doesn't write anything - it could be used to perturb register values, but we're not super worried about that yet
shift_rows_s:
shift_rows_s_impl
bx r14
.endif
@ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1
@ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b
.macro mixcol rx,rt,ru,r0x00,r0x1b
@ let rx=(a,b,c,d)
uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags
sel \ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2
eors \rt,\rt,\ru @ (2a,2b,2c,2d)
eors \ru,\rt,\rx @ (3a,3b,3c,3d)
eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a)
eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b)
eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c)
.endm
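@ The uadd8/sel pair implements bytewise xtime (multiplication by x in GF(2⁸)):
@ uadd8 doubles each byte mod 256, setting the GE flags from the carried-out
@ MSBs, and sel then applies the 0x1b reduction to exactly those bytes;
@ e.g. xtime(0x80) = 0x00 ^ 0x1b = 0x1b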
@ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1
.macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b
uadd8 \rt,\rx,\rx @ field multiplication by 2 as above
sel \rw,\r0x1b,\r0x00
eors \rt,\rt,\rw @ 2x
uadd8 \ru,\rt,\rt
sel \rw,\r0x1b,\r0x00
eors \ru,\ru,\rw @ 4x
uadd8 \rv,\ru,\ru
sel \rw,\r0x1b,\r0x00
eors \rv,\rv,\rw @ 8x
eors \rx,\rx,\rv @ 9x
eors \rw,\rx,\rt @ 11x
eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16
eors \rx,\rx,\ru @ 13x
eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24
eors \rt,\rt,\ru @ 6x
eors \rt,\rt,\rv @ 14x
eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24
.endm
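@ The chain above builds 2x, 4x, 8x by repeated xtime, then combines
@ 9x = 8x^x, 11x = 9x^2x, 13x = 9x^4x and 14x = 8x^4x^2x;
@ since c(x)d(x) = 1 mod x⁴+1, invmixcol undoes mixcol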
.balign 4
.thumb_func
@ Not going to use canaries here as it doesn't write anything - it could be used to perturb register values, but we're not super worried about that yet
@ Trashes r0-r3,r12
mix_cols_s:
mov r2,#0x00000000
mov r3,#0x1b1b1b1b
mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word
mixcol r5 ,r0,r1,r2,r3
mixcol r6 ,r0,r1,r2,r3
mixcol r7 ,r0,r1,r2,r3
ldr r12,=chaff
ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers
mixcol r8 ,r0,r1,r2,r3
mixcol r9 ,r0,r1,r2,r3
mixcol r10,r0,r1,r2,r3
mixcol r11,r0,r1,r2,r3
ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers
bx r14
@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups)
.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3
ubfx \Rspare0,\Rtarg,#0, #8
ubfx \Rspare1,\Rtarg,#8, #8
ubfx \Rspare2,\Rtarg,#16, #8
ubfx \Rspare3,\Rtarg,#24, #8
ldrb \Rspare0,[\Rtable,\Rspare0]
ldrb \Rspare1,[\Rtable,\Rspare1]
ldrb \Rspare2,[\Rtable,\Rspare2]
ldrb \Rspare3,[\Rtable,\Rspare3]
orr \Rspare0,\Rspare0,\Rspare1,lsl#8
orr \Rspare2,\Rspare2,\Rspare3,lsl#8
orr \Rtarg,\Rspare0,\Rspare2,lsl#16
.endm
@ map all bytes of the state through the split LUT, lut_a and lut_b
@ Trashes r0-r3,r12
.macro map_sbox_s_impl
ldr r0,=shareA @ Write out state share A to memory
@ stmia r0,{r4-r7} @ Used to do a STM
getchaffaddress r1
ldr r2,[r1]
str r4,[r0] @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms,
str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired
str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic.
str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1.
str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but
str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic.
str r7,[r0,#12]
str r2,[r1]
ldr r0,=shareB @ Write out state share B to memory
stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with
bl makeperm16 @ Rebuild random 16-way permutation. Maybe do this less frequently
@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation
bl gen_rand_sha_nonpres
mov r11,r0
ldr r8,=lut_a
ldr r9,=lut_b
ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map)
eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk
uxtb r10,r3
ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map)
eors r1,r0,r1
eors r2,r1,r1,lsr#8
movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8
bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
ldr r4,=perm16
ldr r5,=shareA
ldr r6,=shareB
movs r1,#0;movs r2,#0;movs r3,#0
@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
movs r0,#15
1: @ (Ordering instructions to minimise result delays)
ldrb r1,[r4,r0] @ r1 = perm[r0]
mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits
eors r7,r1,#2 @ r7 = perm[r0]^2
ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]]
eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted)
ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2]
eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1
eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]
ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]
eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]
eor r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8)
eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8)
strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand
ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]
subs r0,r0,#1
eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand
eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8)
strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1
bpl 1b
clear03 8 @ barrier
ldmia r6,{r8-r11} @ Read state share B back from memory
clear03 12 @ barrier
getchaffaddress r0,16
bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16
@ldmia r5,{r4-r7} @ Read state share A back from memory
@clear03 16 @ barrier
ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s
ldr r1,[r0]
ldr r6,[r5,#8]
ldr r1,[r0,#8]
ldr r7,[r5,#12]
ldr r1,[r0,#12]
ldr r5,[r5,#4] @ Do r5 last because it's the address register
ldr r1,[r0,#4]
@ Refresh state shares because luts only give imperfect share-by-value
@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent)
@ loadlfsr
@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc
@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16
@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16
@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16
@ savelfsr
.endm
.if INLINE_MAP_SBOX_S
.macro inline_map_sbox_s
map_sbox_s_starts:
// push {lr}
map_sbox_s_impl
// pop {lr}
map_sbox_s_end:
.endm
.else
.balign 4
.thumb_func
map_sbox_s:
GET_CANARY r12,CTAG12,3
push {r12,r14}
map_sbox_s_impl
pop {r12,r14}
CHK_CANARY r12,CTAG12,5
bx r14
.endif
.ltorg
.balign 4
.thumb_func
randomisechaff:
@ Randomise 48 bytes of chaff values (random load values)
@ Uses 12 bytes of permscratch
@ Trashes r0-3
GET_CANARY r0,CTAG13,6
push {r0,r14}
movs r0,#12
ldr r1,=permscratch
bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder
movs r1,#11
1:
push {r1}
bl gen_rand_sha_nonpres
pop {r1}
ldr r2,=permscratch
ldrb r2,[r2,r1]
getchaffaddress r3
str r0,[r3,r2,lsl#2]
subs r1,r1,#1
bpl 1b
pop {r0,r14}
CHK_CANARY r0,CTAG13,6
bx r14
.balign 4
refreshchaff_and_lfsr:
@ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff
@ Re-randomise LFSR with SHA
@ Uses 12 bytes of permscratch
@ Trashes r0-3,12
GET_CANARY r0,CTAG14,6
push {r0,r14}
@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence
bl gen_rand_sha_nonpres
ldr r1,=rstate_lfsr
ldr r2,[r1]
1:
adds r2,r2,r0
@ note that r2 is nonzero on entry, so if r2 + r0 is 0 then r0 = -r2 is nonzero
@ and the second iteration gives r2 + 2*r0 = r0, which is nonzero;
@ the loop therefore retries at most once
beq 1b @ Don't update LFSR state to 0
#if HARDENING
beq 1b
#endif
str r2,[r1]
@ Choose a random order to update chaff words to make 2nd order attacks harder
movs r0,#12
ldr r1,=permscratch
bl makesmallperm
movs r1,#11
1:
push {r1}
bl gen_rand_lfsr_nonpres
pop {r1}
ldr r2,=permscratch
ldr r3,=chaff
ldrb r2,[r2,r1]
ldr r12,[r3,r2,lsl#2]
add r0,r0,r12
str r0,[r3,r2,lsl#2]
subs r1,r1,#1
bpl 1b
pop {r0,r14}
CHK_CANARY r0,CTAG14,6
bx r14
.balign 4
.thumb_func
@ Do sbox on the four bytes of the 4-way share r4-r7
@ Trashes r0,r8-r12
init_key_sbox:
GET_CANARY r12,CTAG15,6
push {r1-r3,r12,r14}
bl gen_rand_sha_nonpres; mov r8,r0
bl gen_rand_sha_nonpres; mov r9,r0
bl gen_rand_sha_nonpres; mov r10,r0
bl gen_rand_sha_nonpres; mov r11,r0
ldr r0,=fourway @ Write out 4-way share to memory
stmia r0,{r8-r11} @ Save random values first to obscure saving of state
stmia r0,{r4-r7}
movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm
movs r5,#0
movs r6,#0
movs r7,#0
bl randomisechaff @ Randomise block of memory mainly used for obscuring loads
movs r0,#4
ldr r1,=permscratch
bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed
ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch)
ldr r4,[r1]
ldr r0,=fourway
uxtab r5,r0,r4
uxtab r6,r0,r4,ror#8
uxtab r7,r0,r4,ror#16
uxtab r8,r0,r4,ror#24
stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3]
bl gen_rand_sha @ Save some randomness for the resharing operation later
movs r7,r0
bl gen_rand_sha
movs r8,r0
ldr r2,=lut_a
ldr r3,=lut_b
ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map)
eors r10,r0,r0,lsr#8
uxtb r10,r10 @ R10 = a0^a1
ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map)
eors r1,r0,r1
eors r4,r1,r1,lsr#8
uxtb r11,r4 @ R11 = a0^a1^b0^b1
eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8
movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24
ldr r1,=permscratch
ldr r11,=chaff
@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk
1:
ands r5,r1,#12
adds r5,r11,r5 @ Align chaff address to r1
ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration)
ldr r5,[r5] @ Random load to mask previous load
ands r9,r6,#12
add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16
ldrb r4,[r6,#0]
ldr r14,[r9,#0] @ Random load to mask previous load
eor r4,r4,r10
eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31
ldrb r5,[r6,#4]
ldr r14,[r9,#4] @ Random load to mask previous load
eors r4,r4,r5
eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31
ldrb r5,[r6,#8]
ldr r14,[r9,#8] @ Random load to mask previous load
eors r4,r4,r5
eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31
ldrb r5,[r6,#12]
ldr r14,[r9,#12] @ Random load to mask previous load
eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk
eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31
ands r14,r4,#255
ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]
and r14,r4,#15
add r14,r14,#32
ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16)
eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24
@ split r5 into two shares and store at [r6,#0] and [r6,#4]
strb r7,[r6,#0]
eors r5,r5,r7
strb r5,[r6,#4]
mov r5,r10,lsr#8 @ r5=a0^a1^b0^b1
ldr r14,[r11,#44] @ Need to eor into a random destination register
eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8
and r14,r14,#255
ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]
and r14,r14,#15
add r4,r11,#24
ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16)
eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24
@ split r5 into two shares and store at [r6,#8] and [r6,#12]
strb r8,[r6,#8]
eors r5,r5,r8
strb r5,[r6,#12]
movs r7,r7,ror#8
movs r8,r8,ror#8
tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16
bne 1b
ldr r0,=fourway
ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7
ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers
pop {r1-r3,r12,r14}
CHK_CANARY r12,CTAG15,6
bx r14
.balign 4
.thumb_func
@ r1 = pointer to 4 x 4-way share (16 words); left unchanged
@ r3 = rkey_s+40*roundkeynumber; advanced by 40
@ Trashes r8-r12
@ If i = word number 0..3,
@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then
@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and
@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4])
@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16
storeroundkey:
GET_CANARY r8,CTAG16,6
push {r2,r8,r14}
@ eor two 4-way share components to make a component of a 2-way share
@ Note that we load from 4-way share at a random address then convert to 2-way share and
@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured
@ by vperm (we don't know which 2-way share is being processed at a particular point in time).
@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share
bl gen_rand_sha @ Get r0 = vperm for shareA of the round key
str r0,[r3,#16]
mov r8,r0,lsr#30
rsb r8,r8,#0 @ r8=-vperm
.if RK_ROR
movs r2,#0
usub8 r2,r2,r0 @ r2=-hperms
.endif
mov r9,#4
1:
and r8,r8,#3
adds r0,r1,r8,lsl#4
ldmia r0,{r10,r11}
.if RK_ROR
mov r10,r10,ror r2
mov r11,r11,ror r2
movs r2,r2,ror#8
.endif
eor r10,r10,r11
str r10,[r3],#4
add r8,r8,#1
subs r9,r9,#1
bne 1b
adds r1,r1,#8
adds r3,r3,#4 @ skip over vperm (already stored)
bl gen_rand_sha @ Get r0 = vperm for shareB of the round key
str r0,[r3,#16]
mov r8,r0,lsr#30
rsb r8,r8,#0 @ r8=-vperm
.if RK_ROR
movs r2,#0
usub8 r2,r2,r0 @ r2=-hperms
.endif
mov r9,#4
ldr r12,=RKshareC
ldr r12,[r12]
1:
and r8,r8,#3
adds r0,r1,r8,lsl#4
ldmia r0,{r10,r11}
eor r10,r10,r12 @ Mix in RKshareC into round key shareB
.if RK_ROR
mov r10,r10,ror r2
mov r11,r11,ror r2
movs r2,r2,ror#8
.endif
mov r10,r10,ror#16
mov r11,r11,ror#16
eor r10,r10,r11
str r10,[r3],#4
add r8,r8,#1
subs r9,r9,#1
bne 1b
subs r1,r1,#8 @ Restore r1 = (r1 on entry)
adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40
pop {r2,r8,r14}
CHK_CANARY r8,CTAG16,6
bx r14
.balign 4
.thumb_func
init_key_4way:
@ On entry, r0 points to 4-way shared raw key data (64 bytes, 64 byte gap for FIB workaround, then other 64 bytes)
@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7
@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K.
@
@ On exit, rkeys_s, a 40*15=600-byte region, is filled as follows.
@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4],
@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information.
@ In addition a common share word, RKshareC, is set randomly.
@ For a given round, rk[i] = the i^th word of the actual round key is given by:
@ vpermA=rka[4]>>30
@ vpermB=rkb[4]>>30
@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4])
@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16
@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC
GET_CANARY r12,CTAG17,6
push {r0-r12,r14}
@ Transfer 4-way key into local workspace, rerandomising the shares
mov r5,r0 @ r5=4-way key input
bl randomisechaff
ldr r6,=rkey4way
movs r7,#8
1:
#if FIB_WORKAROUND
cmp r7,#4
bne 2f
adds r5,#64 @ Skip 64 byte gap for FIB workaround
2:
#endif
ldmia r5!,{r1-r4}
bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0
bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0
bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0
stmia r6!,{r1-r4}
subs r7,r7,#1
bne 1b
@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for
@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys.
bl gen_rand_sha_nonpres
ldr r12,=RKshareC
str r0,[r12] @ Make RKshareC random word
ldr r3,=rkey_s @ r3=rkey_s
ldr r1,=rkey4way @ r1=rkey4way
bl storeroundkey @ Store round key 0 and advance r3 by 40
adds r1,r1,#64
bl storeroundkey @ Store round key 1 and advance r3 by 40
adds r1,r1,#48
ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word
@ r1=rkey4way+128 on entry to main loop
movs r2,#0 @ r2=word counter (0-51), offset from word 8
@ Note that r1-r3 are not sensitive values, so it's safe to stack
@ them and conditionally branch on them.
@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of
@ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14
@ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... -> a48 b48 c48 d48 -> a56 b56 c56 d56
@ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57
@ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58
@ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59
@ a4 b4 c4 d4 a52 b52 c52 d52 ===============
@ a5 b5 c5 d5 a53 b53 c53 d53
@ a6 b6 c6 d6 a54 b54 c54 d54
@ a7 b7 c7 d7 a55 b55 c55 d55
init_key_expandloop:
@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8)
@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words)
@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4])
@ r4-r7 = 4-way share of previous roundkey word
tst r2,#7
bne 1f
subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD
movs r4,r4,ror#8
movs r5,r5,ror#8
movs r6,r6,ror#8
movs r7,r7,ror#8
1:
tst r2,#3
bne 1f
bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7
1:
tst r2,#7
bne 1f
movs r0,r2,lsr#3
mov r8,#1
movs r8,r8,lsl r0
eors r4,r4,r8 @ Every 8th word, add in round constant
1:
ldmia r1,{r8-r11} @ eor with key from two rounds ago and advance r1 by 16
eors r4,r4,r8
eors r5,r5,r9
eors r6,r6,r10
eors r7,r7,r11
stmia r1!,{r4-r7}
add r2,r2,#1
tst r2,#3
bne 1f
subs r1,r1,#64
bl storeroundkey @ Store round key 1+r2/4 and advance r3 by 40
adds r1,r1,#64
1:
cmp r2,#52
bne init_key_expandloop
CHK_COUNT 30,6
pop {r0-r12,r14}
CHK_CANARY r12,CTAG17,6
bx r14
.ltorg
@ Add the round key shares pointed to by r12 into the state shares
@ Trashes r0-r3
.balign 4
addrkey_s:
ldr r0,=chaff @ guaranteed 0 mod 16
.if ST_VPERM
ldr r3,=statevperm
ldr r3,[r3] @ r3=vperm state rotation in bottom two bits
ldr r2,[r0,#12] @ barrier load
.else
movs r3,#0
.endif
bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16
ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits
ldr r2,[r0,#16] @ barrier load
rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot
@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot
@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr
.if RK_ROR
movs r0,r2,lsl#3
movs r1,r1,ror r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0
.else
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0
.endif
clear03_preserve_r3
add r12,r12,#20
@ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr
bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16
ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits
ldr r2,[r0,#16] @ barrier load
rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot
ldr r3,=RKshareC @ r3=common round key shareC
bfi r0,r3,#0,#4
ldr r3,[r3]
ldr r0,[r0] @ barrier load
@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot
@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr
.if RK_ROR
movs r0,r2,lsl#3
movs r1,r1,ror r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r10,r10,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0
.else
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0
ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0
.endif
clear03
bx r14
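@ In other words, each ldr above fetches roundkey word (j + vpermkeyrot - vpermstaterot) mod 4
@ and eors it into state register j: both the key and the state are stored cyclically rotated
@ by their respective vperms, and reading at the offset keyrot-staterot makes the two
@ rotations cancel.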
.balign 4
.thumb_func
@ de/encrypt data in place
@ r0: ivec shareA
@ r1: ivec shareB
@ r2: buf
@ r3: n, number of blocks, n>0
.if CT_BPERM
@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV,
@ the key, and the block number. We can therefore process them in any order, and using a
@ random order helps to defeat attacks that work on the output of the AES, since an attacker
@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction.
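@ Concretely, block n here is recovered as P[n] = C[n] ^ E_k(IV0 ^ n), which
@ depends only on the key, IV0 and n, not on any other block.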
.endif
ctr_crypt_s:
@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks
GET_CANARY r12,CTAG0,6
push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets
push {r0-r3}
#if !CALLER_INIT_RCP_COUNT
SET_COUNT 33,6
#endif
.if CT_BPERM
@ Initialise 32 random numbers (which fit in half-words)
@ r3=number of blocks
ldr r4,=bperm_rand
movs r5,#32
1:
bl gen_rand_sha
umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks)
strh r2,[r4],#2
subs r5,r5,#1
bne 1b
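@ (The umull computes r2 = floor(rand32 * n / 2^32), the standard multiply-shift
@ way of reducing a uniform 32-bit value to the range 0..n-1; the slight
@ non-uniformity is negligible here.)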
.endif
bl randomisechaff
@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0
@ Not doing shareC or state vperm at this point
pop {r0}
ldmia r0,{r4-r7} @ r4-r7 = IVshareA
clear03 16
pop {r1}
ldmia r1,{r8-r11} @ r8-r11 = IVshareB
clear03 32
bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc
bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16
bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16
bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16
ldr r0,=IV0
stmia r0!,{r4-r7}
adds r1,r0,#4
stmia r1,{r8-r11}
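@ (Layout note: shareA now sits at IV0..IV0+15 and ror#16-format shareB at
@ IV0+20..IV0+35; the 4-byte gap at IV0+16 presumably mirrors the roundkey
@ share layout, where [ptr,#16] holds the vperm rotation data.)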
@ "Decommission" IV0 so that it doesn't get stacked
#if 1 // approved by Alex - no side channel leakage it seems
#if HARDENING
// if this is skipped, r4 is likely random, so there is a more than 1 in 4 chance that the ldmia will trap
// in any case it is very unlikely to load useful data below (and presumably the faulting address is uninteresting
// since it is already XORed with random data above)
movs r0, #32
// note if r1 is unset, then we are reading from lut_a
movs r1, #0
ldmia r1!, {r4, r5, r6, r7, r8, r9, r10, r11}
rcp_iequal_nodelay r0, r1
#else
movs r0, #0
ldmia r0, {r4, r5, r6, r7, r8, r9, r10, r11}
#endif
#else
bl gen_rand_sha_nonpres; movs r4,r0
bl gen_rand_sha_nonpres; movs r5,r0
bl gen_rand_sha_nonpres; movs r6,r0
bl gen_rand_sha_nonpres; movs r7,r0
bl gen_rand_sha_nonpres; mov r8,r0
bl gen_rand_sha_nonpres; mov r9,r0
bl gen_rand_sha_nonpres; mov r10,r0
bl gen_rand_sha_nonpres; mov r11,r0
#endif
@ Trashes r0, r1
check_rnd_count (RND_COUNT_decrypt+RND_COUNT_ctr_crypt_s_init)
pop {r1,r2}
@ r1=cipher/plaintext buffer, r2=number of blocks
movs r3,#0
CHK_COUNT 33,6
ctr_crypt_mainloop:
SET_COUNT 80,6
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
push {r1-r3}
@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret)
@ Trashes r0, r1
reset_rnd_count_checked
// No point in having a branch that is never taken (it would cost size and weaken hardening)
#if REFCHAFF_PERIOD != 1
tst r3,#(REFCHAFF_PERIOD-1)
bne 1f
#endif
bl refreshchaff_and_lfsr
1:
ldr r3,[sp,#8] @ get block count off the stack
// No point in having a branch that is never taken (it would cost size and weaken hardening)
#if REMAP_PERIOD != 1
tst r3,#(REMAP_PERIOD-1)
bne 1f
#endif
bl remap @ shuffle the LUTs; this preserves R3
1:
CHK_COUNT 80,6
ldr r0,[sp,#8] @ get block count off the stack
#if HARDENING
@ We check the random counts here. Note we start with the combined count and subtract, just because
@ it might make it marginally more difficult to get the right answer if skipping multiple instructions
movs r1, #(RND_COUNT_remap + RND_COUNT_refreshchaff_and_lfsr)
#if REMAP_PERIOD != 1
tst r0, #(REMAP_PERIOD-1)
it ne
subne r1, #RND_COUNT_remap
#endif
#if REFCHAFF_PERIOD != 1
tst r0, #(REFCHAFF_PERIOD-1)
it ne
subne r1, #RND_COUNT_refreshchaff_and_lfsr
#endif
@ r0=block count, r1=expected sha rand count, r3=block count
rcp_iequal_nodelay r0, r3
@ r1=expected sha rand count, r3=block count
check_rnd_count_dynamic
#endif // HARDENING
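@ (Illustration with hypothetical periods REFCHAFF_PERIOD=2 and REMAP_PERIOD=4:
@ for a block counter that is 0 mod 4 both calls ran, so r1 stays at the sum;
@ for an odd counter neither ran, both subtractions fire, and r1 ends up 0.)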
@ r3=block count
@ No point in having a branch that is never taken (it would cost size and weaken hardening)
#if REFROUNDKEYSHARES_PERIOD != 1
#if HARDENING
// we want to check that this is actually called often enough
#warning REFROUNDKEYSHARES_PERIOD check needs hardening
#endif
tst r3,#(REFROUNDKEYSHARES_PERIOD-1)
bne skip_ref_roundkey_shares_s
#endif
#if INLINE_REF_ROUNDKEY_SHARES_S
inline_ref_roundkey_shares_s
#else
#if HARDENING
// todo graham we could remove this for space, as I don't think r4 and r5 are equal
@ Make sure r4 != r5 on entry to ref_roundkey_shares_s
subs r4, r5, #1
#endif
bl ref_roundkey_shares_s @ refresh the round key shares
#if HARDENING
@ r4 and r5 are set equal by ref_roundkey_shares_s (note we don't do a rnd_check as no sha random numbers are generated)
rcp_iequal_nodelay r4, r5
#endif
#endif
skip_ref_roundkey_shares_s:
#if REFROUNDKEYHVPERMS_PERIOD != 1
#if HARDENING
// we want to check that this is actually called often enough
#warning REFROUNDKEYHVPERMS_PERIOD check needs hardening
#endif
ldr r3,[sp,#8] @ get block count off the stack
tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1)
bne skip_ref_roundkey_hvperm_s
#endif
#if INLINE_REF_ROUNDKEY_HVPERMS_S
inline_ref_roundkey_hvperms_s
#else
bl ref_roundkey_hvperms_s @ refresh the round key vperms
#if HARDENING
movs r0, #30
@ r7 should be 30 on exit from ref_roundkey_hvperms_s
rcp_iequal_nodelay r0, r7
#endif
#endif
skip_ref_roundkey_hvperms_s:
CHK_COUNT 81,6
@ Trashes r0, r1
reset_rnd_count
pop {r1-r3}
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter
.if CT_BPERM
@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7
push {r1}
ldr r0,=murmur3_constants
ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants
ldr r0,=bperm_rand
movs r1,#31
movs r4,r3 @ r4=i
1:
ldrh r5,[r0],#2 @ r5=k
subs r5,r5,r4 @ r5=k-i
ands r6,r2,r5,asr#31 @ r6=n*(k-i<0)
adds r5,r5,r6 @ r5=j=(k-i)%n
adds r6,r4,r5 @ r6=i+j
subs r7,r4,r5 @ r7=i-j
and r8,r7,r7,asr#31 @ r8=min(i-j,0)
sub r7,r7,r8,lsl#1 @ r7=|i-j|
mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j}
eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions
@ Now do murmur3_32 hash of r6
mul r6,r6,r9
movs r6,r6,ror#17
mul r6,r6,r10
movs r6,r6,ror#19
adds r6,r6,r6,lsl#2
add r6,r6,r11
eors r6,r6,#4
eors r6,r6,r6,lsr#16
mul r6,r6,r12
eors r6,r6,r6,lsr#13
mul r6,r6,r14
eors r6,r6,r6,lsr#16 @ final avalanche step; does not affect the top bit, which is all we use below
@ Now set i to j, conditional on the top bit of r6
subs r7,r5,r4 @ r7=j-i
ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6)
adds r4,r4,r7 @ r4=j if top bit of r6, else i
subs r1,r1,#1
bpl 1b
// todo: loop check
pop {r1}
mov r12,r4
.else
mov r12,r3
.endif
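@ The swap-or-not loop above (CT_BPERM case) is, in C-like pseudocode (a
@ sketch ignoring register allocation):
@
@   i = blockcounter;
@   for (r = 31; r >= 0; r--) {
@     j = (k[r] - i) mod n;          // k[r] = random value in 0..n-1 from bperm_rand
@     pair = n*(i + j) + |i - j|;    // encodes the unordered pair {i,j}
@     if (murmur3_32(pair ^ (r << 27)) >> 31) i = j;
@   }
@   blocknumber = i;
@
@ The pair encoding and the hash are symmetric in i and j, so both elements of
@ a pair make the same swap decision in each round, which is what makes the
@ overall map a permutation of 0..n-1.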
CHK_COUNT 82,6
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered)
push {r1-r3,r12}
@ r12=block number; IV0 is loaded into r4-r11 below
processIV: @ non-target label to assist power analysis
ldr r8,=IV0
ldmia r8,{r4-r7} @ load IV0_A
clear03 16
add r8,r8,#20
ldmia r8,{r8-r11} @ load IV0_B
clear03 32
rev r0,r12
eor r7,r7,r0 @ XOR the block number into IV0: IV(block n) = IV0 ^ n, cf standard CTR mode's IV0 + n
@ XOR (rather than addition) is compatible with XOR-shares, so it is stealthier/simpler: we never have to unshare to compute IV(block n)
@ r4-r11 = IV for the current block
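@ (E.g. with IV0 held as two shares A ^ B, computing (A ^ n) ^ B already gives
@ IV0 ^ n: the block number folds into a single share, whereas IV0 + n would
@ require recombining the shares to propagate carries.)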
CHK_COUNT 83,6
.if ST_SHAREC
bl gen_rand_sha_nonpres @ Create state share C; all bytes the same
ands r0,r0,#255
orrs r0,r0,r0,lsl#8
orrs r12,r0,r0,lsl#16
ldr r1,=shareC
str r12,[r1]
.else
movs r12,#0
.endif
@ r4-r11 = IV for the current block w/o shareC, r12=shareC
@ refresh state shares and mix in shareC
bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc
bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16
bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16
bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16
.if ST_VPERM
bl gen_rand_sha_nonpres
ldr r1,=statevperm
movs r2,#0
str r2,[r1]
bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG)
#if HARDENING
// r1 is set to lut_b by addstatevperm
ldr r0, =shareB + 0x10
rcp_iequal_nodelay r0, r1
#endif
.endif
@ Trashes r0, r1
check_rnd_count RND_COUNT_ctr_crypt_mainloop_A
CHK_COUNT 84,6
.if ST_SHAREC @ Avoid func call if the func is empty
bl conjshareC @ Add the effect of shareC to lut_a, lut_b
#if HARDENING
// r1 is set to lut_b by conjshareC
ldr r2,=lut_b
rcp_iequal_nodelay r1, r2
#endif
.endif
// todo graham remove this count
CHK_COUNT 85,6
@ now perform the 15 encryption rounds on (key, state=IV+x)
@ here r4-r7, r8-r11: state
mov r2,#0 @ round counter
rounds_s_mainloop:
@ Trashes r0, r1
reset_rnd_count_checked
ldr r12,=rkey_s
add r12,r12,r2,lsl#5 @ pointer to key shares for this round
add r12,r12,r2,lsl#3
push {r2} @ save round count
bl addrkey_s
.if INLINE_MAP_SBOX_S
inline_map_sbox_s
.else
bl map_sbox_s
.endif
.if INLINE_SHIFT_ROWS_S
inline_shift_rows_s
.else
bl shift_rows_s
.endif
.if ST_VPERM
ldr r2,[sp] @ peek at stack to get round count
cmp r2,#NUMREFSTATEVPERM
bcs 1f
bl gen_rand_lfsr_nonpres
ldr r1,=statevperm
bl addstatevperm @ V shuffle of r4-r11
#if HARDENING
// r1 is set to lut_b by addstatevperm
ldr r2, =shareB + 0x10
rcp_iequal_nodelay r1, r2
#endif
1:
.endif
pop {r2}
adds r2,r2,#1 @ increment round counter
cmp r2,#14
beq 2f @ exit the loop after 14 rounds (the last round has no mix_cols)
push {r2}
bl mix_cols_s
pop {r2}
b rounds_s_mainloop
2:
#if HARDENING
movs r1, #14
rcp_iequal_nodelay r1, r2
#endif
CHK_COUNT 86,6
ldr r12,=rkey_s+14*40 @ final round key shares
// todo graham check this is called
bl addrkey_s
CHK_COUNT 87,6
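@ Overall, the round loop plus the final addrkey_s above implement (a sketch
@ omitting the shares, vperms and chaff; equivalent to the FIPS-197 ordering
@ with the initial AddRoundKey folded into round 0):
@
@   for (r = 0; r < 14; r++) {
@     AddRoundKey(rk[r]); SubBytes(); ShiftRows();
@     if (r != 13) MixColumns();
@   }
@   AddRoundKey(rk[14]);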
.if ST_SHAREC @ Avoid func call if the func is empty
// todo alex, I assume that skipping this will cause bad things to happen anyway?
bl conjshareC @ Undo the effect of shareC from lut_a, lut_b
.endif
CHK_COUNT 88,6
.if ST_VPERM
@ Undo the effects of vperm rotation recorded in statevperm
ldr r1,=statevperm
ldr r2,[r1]
rsbs r0,r2,#0
@ We don't check this is called since failing to undo this is probably going to break decryption
// todo alex is this fair?
bl addstatevperm
.endif
pop {r1-r3,r12}
push {r1,r3}
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
decryption_start:
@ Decrypt ciphertext using AES output in shares: r4-r11
.if ST_SHAREC
ldr r0,=shareC
ldr r0,[r0]
.else
movs r0,#0
.endif
ldr r14,=chaff
@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff
CHK_COUNT 89,6
add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered
ldr r3,[r1] @ r3=ciphertext word
eors r3,r3,r4 @ r3=r3^shareA
ldr r4,[r14] @ barrier load
eor r3,r3,r8,ror#16 @ r3=r3^shareB
eors r3,r3,r0 @ r3=r3^shareC
str r3,[r1] @ plaintext word=r3
ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block...
ldr r4,[r14,#4]
eors r3,r3,r5
eor r3,r3,r9,ror#16
eors r3,r3,r0
str r3,[r1,#4]
ldr r3,[r1,#8]
ldr r4,[r14,#8]
eors r3,r3,r6
eor r3,r3,r10,ror#16
eors r3,r3,r0
str r3,[r1,#8]
ldr r3,[r1,#12]
ldr r4,[r14,#12]
eors r3,r3,r7
eor r3,r3,r11,ror#16
eors r3,r3,r0
str r3,[r1,#12]
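@ The pattern above, per word: plaintext = ciphertext ^ shareA ^ ror(shareB,16) ^ shareC,
@ with a load from the chaff area between the shareA and shareB eors so the two
@ shares never meet in consecutive loads (cf the "barrier load" comments
@ earlier), which would otherwise risk an implicit shareA^shareB power signal.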
CHK_COUNT 90,6
@ Trashes r0, r1
check_rnd_count RND_COUNT_decryption_end
pop {r1,r3} @ Restore r1 to point to start of buffer
@ Restore block counter
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
decryption_end:
adds r3,r3,#1
cmp r3,r2
CHK_COUNT 91,6
bne ctr_crypt_mainloop
// todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far?
#if HARDENING
rcp_iequal_nodelay r2, r3
#endif
#if WIPE_MEMORY
@ Wipe memory from workspace_start up to the stack pointer
@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals
ldr r4,=workspace_start
add r5, r4, #rstate_all_start - workspace_start
#if HARDENING
ldr r7,=workspace_start
add r6, r4, #rstate_all_start - workspace_start
rcp_iequal_nodelay r4, r7
#endif
#if HARDENING
// todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far?
@ Recheck of above
rcp_iequal_nodelay r3, r2
#endif
1:
bl gen_rand_sha_nonpres
stmia r4!,{r0}
cmp r4,r5
bcc 1b
#if HARDENING
rcp_iequal_nodelay r4, r6
mov r6,sp
#endif
// note: if this load is skipped, then we are just erasing from where we left off before
.if rstate_all_end <= rstate_all_start
.err
.endif
ldr r4,=rstate_all_end
mov r5,sp @ gcc arm assembler says cmp r4,sp is deprecated, so use another register
1:
bl gen_rand_sha_nonpres
stmia r4!,{r0}
cmp r4,r5
bcc 1b
#if HARDENING
rcp_iequal_nodelay r4, r6
#endif
@ Then fill everything with zeros so as not to leave behind clues about the RNG state
ldr r4,=workspace_start
movs r0,#0
mov r5,sp
1:
stmia r4!,{r0}
cmp r4,r5
bcc 1b
#if HARDENING
rcp_iequal_nodelay r4, r6
#endif
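@ In outline, the wipe above does (C-like pseudocode):
@   for (p = workspace_start; p < rstate_all_start;) *p++ = rand();  // below the RNG state
@   for (p = rstate_all_end;  p < sp;)               *p++ = rand();  // above the RNG state
@   for (p = workspace_start; p < sp;)               *p++ = 0;       // everything, incl. RNG state
@ i.e. randomise first (while the SHA RNG state is still in use), then zero the
@ whole range so no trace of the RNG state itself remains.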
#endif
.if GEN_RAND_SHA
SET_COUNT 23,6
bl reset_sha_trng @ clear out the SHA hardware
.endif
pop {r0-r12,r14}
CHK_CANARY r12,CTAG0,6
bx r14