blob: e3c8cb15e705c06da5c1a6e9af3eeacdf15f8eb0 [file] [log] [blame]
Kazuho Oku3aa0c152016-11-01 13:48:56 +09001/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
2
3#ifndef _UECC_ASM_ARM_H_
4#define _UECC_ASM_ARM_H_
5
/* uECC_MIN_WORDS: word count of the smallest enabled curve. The checks run from
 * the largest size (8 words) down to the smallest (5 words); each later match
 * #undef's and redefines the macro, so the last enabled curve wins. */
6#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
7 #define uECC_MIN_WORDS 8
8#endif
9#if uECC_SUPPORTS_secp224r1
10 #undef uECC_MIN_WORDS
11 #define uECC_MIN_WORDS 7
12#endif
13#if uECC_SUPPORTS_secp192r1
14 #undef uECC_MIN_WORDS
15 #define uECC_MIN_WORDS 6
16#endif
17#if uECC_SUPPORTS_secp160r1
18 #undef uECC_MIN_WORDS
19 #define uECC_MIN_WORDS 5
20#endif
21
/* Inline-asm operand constraints used by the looped ("small") add/sub below:
 * the "l" constraint on uECC_arm_thumb, the general "r" constraint otherwise. */
22#if (uECC_PLATFORM == uECC_arm_thumb)
23 #define REG_RW "+l"
24 #define REG_WRITE "=l"
25#else
26 #define REG_RW "+r"
27 #define REG_WRITE "=r"
28#endif
29
/* Constraints used by the unrolled add/sub below: "l" on both uECC_arm_thumb and
 * uECC_arm_thumb2, "r" otherwise.
 * NOTE(review): presumably "l" is forced on Thumb-2 so every instruction in the
 * unrolled sequence has the 2-byte size assumed by the jump computation. */
30#if (uECC_PLATFORM == uECC_arm_thumb || uECC_PLATFORM == uECC_arm_thumb2)
31 #define REG_RW_LO "+l"
32 #define REG_WRITE_LO "=l"
33#else
34 #define REG_RW_LO "+r"
35 #define REG_WRITE_LO "=r"
36#endif
37
/* Emitted at the end of every asm block, each of which starts with
 * ".syntax unified": expands to nothing on uECC_arm_thumb2 and to
 * ".syntax divided" on every other platform. */
38#if (uECC_PLATFORM == uECC_arm_thumb2)
39 #define RESUME_SYNTAX
40#else
41 #define RESUME_SYNTAX ".syntax divided \n\t"
42#endif
43
44#if (uECC_OPTIMIZATION_LEVEL >= 2)
45
/*
 * Unrolled multi-word addition: result[] = left[] + right[].
 *
 * The first word is added with a plain "adds"; the remaining words are handled
 * by a straight-line sequence of load/load/adcs/store groups produced by
 * REPEAT(DEC(uECC_MAX_WORDS), ...) (macros defined outside this file section).
 *
 * When uECC_MAX_WORDS != uECC_MIN_WORDS the routine has to cope with several
 * word counts: it computes the address of label 1 plus the size of
 * (uECC_MAX_WORDS - num_words) groups and branches there with "bx", so that
 * only num_words - 1 groups run after the first word. When the two macros are
 * equal no jump is emitted, every group runs, and num_words is not read.
 *
 * Returns the carry out of the last word added (0 or 1).
 */
46uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
47 const uECC_word_t *left,
48 const uECC_word_t *right,
49 wordcount_t num_words) {
50#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
51 #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
/* Byte offset of the groups to skip: 4 instructions per group, 2 bytes each.
 * NOTE(review): the +1 presumably sets the Thumb state bit required by "bx". */
52 uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
53 #else /* ARM */
/* Byte offset of the groups to skip: 4 instructions per group, 4 bytes each. */
54 uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
55 #endif
56#endif
57 uint32_t carry;
58 uint32_t left_word;
59 uint32_t right_word;
60
61 __asm__ volatile (
62 ".syntax unified \n\t"
63 "movs %[carry], #0 \n\t" /* carry = 0 (needed by the final adcs) */
64 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
65 "adr %[left], 1f \n\t" /* scratch = address of label 1 */
66 ".align 4 \n\t"
67 "adds %[jump], %[left] \n\t" /* jump = label 1 + offset of skipped groups */
68 #endif
69
/* First word: plain add, which establishes the carry flag for the chain. */
70 "ldmia %[lptr]!, {%[left]} \n\t"
71 "ldmia %[rptr]!, {%[right]} \n\t"
72 "adds %[left], %[right] \n\t"
73 "stmia %[dptr]!, {%[left]} \n\t"
74
75 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
76 "bx %[jump] \n\t" /* enter the unrolled sequence part-way through */
77 #endif
78 "1: \n\t"
/* Remaining words: add with carry, one group of four instructions per word. */
79 REPEAT(DEC(uECC_MAX_WORDS),
80 "ldmia %[lptr]!, {%[left]} \n\t"
81 "ldmia %[rptr]!, {%[right]} \n\t"
82 "adcs %[left], %[right] \n\t"
83 "stmia %[dptr]!, {%[left]} \n\t")
84
85 "adcs %[carry], %[carry] \n\t" /* carry = 0 + 0 + C: capture the final carry flag */
86 RESUME_SYNTAX
87 : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
88 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
89 [jump] REG_RW_LO (jump),
90 #endif
91 [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
92 [right] REG_WRITE_LO (right_word)
93 :
94 : "cc", "memory"
95 );
96 return carry;
97}
/* Marks uECC_vli_add as provided, so the "small" variant below is skipped. */
98#define asm_add 1
99
/*
 * Unrolled multi-word subtraction: result[] = left[] - right[].
 *
 * Same structure as the unrolled uECC_vli_add: the first word uses "subs", the
 * remaining words use "sbcs" groups produced by REPEAT(DEC(uECC_MAX_WORDS), ...),
 * and when uECC_MAX_WORDS != uECC_MIN_WORDS a computed "bx" skips
 * (uECC_MAX_WORDS - num_words) groups so that only num_words words are processed.
 *
 * Returns the borrow out of the last word: the inverse of the captured carry flag.
 */
100uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
101 const uECC_word_t *left,
102 const uECC_word_t *right,
103 wordcount_t num_words) {
104#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
105 #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
/* Byte offset of the groups to skip: 4 instructions per group, 2 bytes each.
 * NOTE(review): the +1 presumably sets the Thumb state bit required by "bx". */
106 uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
107 #else /* ARM */
/* Byte offset of the groups to skip: 4 instructions per group, 4 bytes each. */
108 uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
109 #endif
110#endif
111 uint32_t carry;
112 uint32_t left_word;
113 uint32_t right_word;
114
115 __asm__ volatile (
116 ".syntax unified \n\t"
117 "movs %[carry], #0 \n\t" /* carry = 0 (needed by the final adcs) */
118 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
119 "adr %[left], 1f \n\t" /* scratch = address of label 1 */
120 ".align 4 \n\t"
121 "adds %[jump], %[left] \n\t" /* jump = label 1 + offset of skipped groups */
122 #endif
123
/* First word: plain subtract, which establishes the carry flag for the chain. */
124 "ldmia %[lptr]!, {%[left]} \n\t"
125 "ldmia %[rptr]!, {%[right]} \n\t"
126 "subs %[left], %[right] \n\t"
127 "stmia %[dptr]!, {%[left]} \n\t"
128
129 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
130 "bx %[jump] \n\t" /* enter the unrolled sequence part-way through */
131 #endif
132 "1: \n\t"
/* Remaining words: subtract with carry, one group of four instructions per word. */
133 REPEAT(DEC(uECC_MAX_WORDS),
134 "ldmia %[lptr]!, {%[left]} \n\t"
135 "ldmia %[rptr]!, {%[right]} \n\t"
136 "sbcs %[left], %[right] \n\t"
137 "stmia %[dptr]!, {%[left]} \n\t")
138
139 "adcs %[carry], %[carry] \n\t" /* carry = 0 + 0 + C: capture the final carry flag */
140 RESUME_SYNTAX
141 : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
142 #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
143 [jump] REG_RW_LO (jump),
144 #endif
145 [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
146 [right] REG_WRITE_LO (right_word)
147 :
148 : "cc", "memory"
149 );
150 return !carry; /* On ARM the carry flag is set when a subtraction does NOT borrow,
 so the borrow returned to the caller is the inverse of the captured carry bit. */
152}
/* Marks uECC_vli_sub as provided, so the "small" variant below is skipped. */
153#define asm_sub 1
154
155#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
156
157#if (uECC_OPTIMIZATION_LEVEL >= 3)
158
159#if (uECC_PLATFORM != uECC_arm_thumb)
160
161#if uECC_ARM_USE_UMAAL
162 #include "asm_arm_mult_square_umaal.inc"
163#else
164 #include "asm_arm_mult_square.inc"
165#endif
166
167#if (uECC_OPTIMIZATION_LEVEL == 3)
168
/*
 * Multi-word multiplication for uECC_OPTIMIZATION_LEVEL == 3 (non-Thumb-1 builds).
 *
 * The arguments are pinned to r0 (result), r1 (left), r2 (right) and
 * r3 (num_words) through register variables, and a single asm block is built
 * from the FAST_MULT_ASM_* macros of the included .inc file: the macro for the
 * smallest enabled word count, followed by the "_TO_" macros for each larger
 * size up to uECC_MAX_WORDS.
 *
 * NOTE(review): the macro bodies are not visible here; presumably each stage
 * tests r3 and branches to label 1 once num_words words have been handled —
 * confirm against the .inc file.
 */
169uECC_VLI_API void uECC_vli_mult(uint32_t *result,
170 const uint32_t *left,
171 const uint32_t *right,
172 wordcount_t num_words) {
173 register uint32_t *r0 __asm__("r0") = result;
174 register const uint32_t *r1 __asm__("r1") = left;
175 register const uint32_t *r2 __asm__("r2") = right;
176 register uint32_t r3 __asm__("r3") = num_words;
177
178 __asm__ volatile (
179 ".syntax unified \n\t"
180#if (uECC_MIN_WORDS == 5)
181 FAST_MULT_ASM_5
182 #if (uECC_MAX_WORDS > 5)
183 FAST_MULT_ASM_5_TO_6
184 #endif
185 #if (uECC_MAX_WORDS > 6)
186 FAST_MULT_ASM_6_TO_7
187 #endif
188 #if (uECC_MAX_WORDS > 7)
189 FAST_MULT_ASM_7_TO_8
190 #endif
191#elif (uECC_MIN_WORDS == 6)
192 FAST_MULT_ASM_6
193 #if (uECC_MAX_WORDS > 6)
194 FAST_MULT_ASM_6_TO_7
195 #endif
196 #if (uECC_MAX_WORDS > 7)
197 FAST_MULT_ASM_7_TO_8
198 #endif
199#elif (uECC_MIN_WORDS == 7)
200 FAST_MULT_ASM_7
201 #if (uECC_MAX_WORDS > 7)
202 FAST_MULT_ASM_7_TO_8
203 #endif
204#elif (uECC_MIN_WORDS == 8)
205 FAST_MULT_ASM_8
206#endif
207 "1: \n\t" /* common end label of the macro sequence */
208 RESUME_SYNTAX
/* r0-r2 are read-write operands, r3 is input only; r4-r12 and r14 are
 * declared as clobbered. */
209 : "+r" (r0), "+r" (r1), "+r" (r2)
210 : "r" (r3)
211 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
212 );
213}
/* Marks uECC_vli_mult as provided, so the "small" variant below is skipped. */
214#define asm_mult 1
215
216#if uECC_SQUARE_FUNC
/*
 * Multi-word squaring for uECC_OPTIMIZATION_LEVEL == 3 (non-Thumb-1 builds).
 *
 * The arguments are pinned to r0 (result), r1 (left) and r2 (num_words), and a
 * single asm block is built from the FAST_SQUARE_ASM_* macros of the included
 * .inc file: the macro for the smallest enabled word count, followed by the
 * "_TO_" macros for each larger size up to uECC_MAX_WORDS.
 *
 * NOTE(review): the macro bodies are not visible here; presumably each stage
 * tests r2 and branches to label 1 once num_words words have been handled —
 * confirm against the .inc file.
 */
217uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
218 const uECC_word_t *left,
219 wordcount_t num_words) {
220 register uint32_t *r0 __asm__("r0") = result;
221 register const uint32_t *r1 __asm__("r1") = left;
222 register uint32_t r2 __asm__("r2") = num_words;
223
224 __asm__ volatile (
225 ".syntax unified \n\t"
226#if (uECC_MIN_WORDS == 5)
227 FAST_SQUARE_ASM_5
228 #if (uECC_MAX_WORDS > 5)
229 FAST_SQUARE_ASM_5_TO_6
230 #endif
231 #if (uECC_MAX_WORDS > 6)
232 FAST_SQUARE_ASM_6_TO_7
233 #endif
234 #if (uECC_MAX_WORDS > 7)
235 FAST_SQUARE_ASM_7_TO_8
236 #endif
237#elif (uECC_MIN_WORDS == 6)
238 FAST_SQUARE_ASM_6
239 #if (uECC_MAX_WORDS > 6)
240 FAST_SQUARE_ASM_6_TO_7
241 #endif
242 #if (uECC_MAX_WORDS > 7)
243 FAST_SQUARE_ASM_7_TO_8
244 #endif
245#elif (uECC_MIN_WORDS == 7)
246 FAST_SQUARE_ASM_7
247 #if (uECC_MAX_WORDS > 7)
248 FAST_SQUARE_ASM_7_TO_8
249 #endif
250#elif (uECC_MIN_WORDS == 8)
251 FAST_SQUARE_ASM_8
252#endif
253
254 "1: \n\t" /* common end label of the macro sequence */
255 RESUME_SYNTAX
/* r0 and r1 are read-write operands, r2 is input only; r3-r12 and r14 are
 * declared as clobbered. */
256 : "+r" (r0), "+r" (r1)
257 : "r" (r2)
258 : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
259 );
260}
/* Marks uECC_vli_square as provided, so the "small" variant below is skipped. */
261#define asm_square 1
262#endif /* uECC_SQUARE_FUNC */
263
264#else /* (uECC_OPTIMIZATION_LEVEL > 3) */
265
/*
 * Multi-word multiplication for uECC_OPTIMIZATION_LEVEL > 3 (non-Thumb-1 builds).
 *
 * Instead of one chained asm sequence, each enabled curve size gets its own asm
 * block containing the matching FAST_MULT_ASM_<n> macro (from the included .inc
 * file), selected at run time by comparing num_words with 5, 6, 7 and 8.
 * The arguments are pinned to r0 (result), r1 (left), r2 (right) and
 * r3 (num_words) through register variables.
 *
 * If num_words matches none of the enabled sizes the function falls off the end
 * without writing to result.
 */
266uECC_VLI_API void uECC_vli_mult(uint32_t *result,
267 const uint32_t *left,
268 const uint32_t *right,
269 wordcount_t num_words) {
270 register uint32_t *r0 __asm__("r0") = result;
271 register const uint32_t *r1 __asm__("r1") = left;
272 register const uint32_t *r2 __asm__("r2") = right;
273 register uint32_t r3 __asm__("r3") = num_words;
274
275#if uECC_SUPPORTS_secp160r1
/* 5-word operands. */
276 if (num_words == 5) {
277 __asm__ volatile (
278 ".syntax unified \n\t"
279 FAST_MULT_ASM_5
280 RESUME_SYNTAX
281 : "+r" (r0), "+r" (r1), "+r" (r2)
282 : "r" (r3)
283 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
284 );
285 return;
286 }
287#endif
288#if uECC_SUPPORTS_secp192r1
/* 6-word operands. */
289 if (num_words == 6) {
290 __asm__ volatile (
291 ".syntax unified \n\t"
292 FAST_MULT_ASM_6
293 RESUME_SYNTAX
294 : "+r" (r0), "+r" (r1), "+r" (r2)
295 : "r" (r3)
296 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
297 );
298 return;
299 }
300#endif
301#if uECC_SUPPORTS_secp224r1
/* 7-word operands. */
302 if (num_words == 7) {
303 __asm__ volatile (
304 ".syntax unified \n\t"
305 FAST_MULT_ASM_7
306 RESUME_SYNTAX
307 : "+r" (r0), "+r" (r1), "+r" (r2)
308 : "r" (r3)
309 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
310 );
311 return;
312 }
313#endif
314#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
/* 8-word operands. */
315 if (num_words == 8) {
316 __asm__ volatile (
317 ".syntax unified \n\t"
318 FAST_MULT_ASM_8
319 RESUME_SYNTAX
320 : "+r" (r0), "+r" (r1), "+r" (r2)
321 : "r" (r3)
322 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
323 );
324 return;
325 }
326#endif
327}
/* Marks uECC_vli_mult as provided, so the "small" variant below is skipped. */
328#define asm_mult 1
329
330#if uECC_SQUARE_FUNC
/*
 * Multi-word squaring for uECC_OPTIMIZATION_LEVEL > 3 (non-Thumb-1 builds).
 *
 * Each enabled curve size gets its own asm block containing the matching
 * FAST_SQUARE_ASM_<n> macro (from the included .inc file), selected at run time
 * by comparing num_words with 5, 6, 7 and 8. The arguments are pinned to
 * r0 (result), r1 (left) and r2 (num_words) through register variables.
 *
 * If num_words matches none of the enabled sizes the function falls off the end
 * without writing to result.
 */
331uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
332 const uECC_word_t *left,
333 wordcount_t num_words) {
334 register uint32_t *r0 __asm__("r0") = result;
335 register const uint32_t *r1 __asm__("r1") = left;
336 register uint32_t r2 __asm__("r2") = num_words;
337
338#if uECC_SUPPORTS_secp160r1
/* 5-word operand. */
339 if (num_words == 5) {
340 __asm__ volatile (
341 ".syntax unified \n\t"
342 FAST_SQUARE_ASM_5
343 RESUME_SYNTAX
344 : "+r" (r0), "+r" (r1)
345 : "r" (r2)
346 : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
347 );
348 return;
349 }
350#endif
351#if uECC_SUPPORTS_secp192r1
/* 6-word operand. */
352 if (num_words == 6) {
353 __asm__ volatile (
354 ".syntax unified \n\t"
355 FAST_SQUARE_ASM_6
356 RESUME_SYNTAX
357 : "+r" (r0), "+r" (r1)
358 : "r" (r2)
359 : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
360 );
361 return;
362 }
363#endif
364#if uECC_SUPPORTS_secp224r1
/* 7-word operand. */
365 if (num_words == 7) {
366 __asm__ volatile (
367 ".syntax unified \n\t"
368 FAST_SQUARE_ASM_7
369 RESUME_SYNTAX
370 : "+r" (r0), "+r" (r1)
371 : "r" (r2)
372 : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
373 );
374 return;
375 }
376#endif
377#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
/* 8-word operand. */
378 if (num_words == 8) {
379 __asm__ volatile (
380 ".syntax unified \n\t"
381 FAST_SQUARE_ASM_8
382 RESUME_SYNTAX
383 : "+r" (r0), "+r" (r1)
384 : "r" (r2)
385 : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
386 );
387 return;
388 }
389#endif
390}
/* Marks uECC_vli_square as provided, so the "small" variant below is skipped. */
391#define asm_square 1
392#endif /* uECC_SQUARE_FUNC */
393
394#endif /* (uECC_OPTIMIZATION_LEVEL > 3) */
395
396#endif /* uECC_PLATFORM != uECC_arm_thumb */
397
398#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
399
400/* ---- "Small" implementations ---- */
401
402#if !asm_add
/*
 * Compact, looped multi-word addition: result[] = left[] + right[].
 * Only compiled when no unrolled uECC_vli_add was defined above (asm_add unset).
 *
 * The running carry lives in bit 0 of `carry` between iterations: each pass
 * shifts it into the carry flag, adds one word with carry, then stores the new
 * flag back into `carry`.
 *
 * The counter is tested after the loop body, so num_words must be at least 1.
 * Returns the carry out of the last word (0 or 1).
 */
403uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
404 const uECC_word_t *left,
405 const uECC_word_t *right,
406 wordcount_t num_words) {
407 uint32_t carry = 0;
408 uint32_t left_word;
409 uint32_t right_word;
410
411 __asm__ volatile (
412 ".syntax unified \n\t"
413 "1: \n\t"
414 "ldmia %[lptr]!, {%[left]} \n\t" /* Load left word. */
415 "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
416 "lsrs %[carry], #1 \n\t" /* Set up carry flag (carry = 0 after this). */
417 "adcs %[left], %[left], %[right] \n\t" /* Add with carry. */
418 "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
419 "stmia %[dptr]!, {%[left]} \n\t" /* Store result word. */
420 "subs %[ctr], #1 \n\t" /* Decrement counter. */
421 "bne 1b \n\t" /* Loop until counter == 0. */
422 RESUME_SYNTAX
423 : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
424 [ctr] REG_RW (num_words), [carry] REG_RW (carry),
425 [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
426 :
427 : "cc", "memory"
428 );
429 return carry;
430}
431#define asm_add 1
432#endif
433
434#if !asm_sub
/*
 * Compact, looped multi-word subtraction: result[] = left[] - right[].
 * Only compiled when no unrolled uECC_vli_sub was defined above (asm_sub unset).
 *
 * The running carry lives in bit 0 of `carry` between iterations. It starts at 1
 * because a set carry flag means "no borrow" for sbcs. Each pass shifts it into
 * the carry flag, subtracts one word with carry, then stores the new flag back.
 *
 * The counter is tested after the loop body, so num_words must be at least 1.
 * Returns the borrow out of the last word (1 if a borrow occurred, else 0).
 */
435uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
436 const uECC_word_t *left,
437 const uECC_word_t *right,
438 wordcount_t num_words) {
439 uint32_t carry = 1; /* carry = 1 initially (means don't borrow) */
440 uint32_t left_word;
441 uint32_t right_word;
442
443 __asm__ volatile (
444 ".syntax unified \n\t"
445 "1: \n\t"
446 "ldmia %[lptr]!, {%[left]} \n\t" /* Load left word. */
447 "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
448 "lsrs %[carry], #1 \n\t" /* Set up carry flag (carry = 0 after this). */
449 "sbcs %[left], %[left], %[right] \n\t" /* Subtract with borrow. */
450 "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
451 "stmia %[dptr]!, {%[left]} \n\t" /* Store result word. */
452 "subs %[ctr], #1 \n\t" /* Decrement counter. */
453 "bne 1b \n\t" /* Loop until counter == 0. */
454 RESUME_SYNTAX
455 : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
456 [ctr] REG_RW (num_words), [carry] REG_RW (carry),
457 [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
458 :
459 : "cc", "memory"
460 );
461 return !carry; /* carry flag set means no borrow, so invert it */
462}
463#define asm_sub 1
464#endif
465
466#if !asm_mult
/*
 * Compact, looped multi-word multiplication: result[0 .. 2*num_words-1] =
 * left[] * right[]. Only compiled when no faster uECC_vli_mult was defined
 * above (asm_mult unset).
 *
 * The product is built one result word at a time: for each output index k, all
 * partial products left[i] * right[k - i] are summed into a three-word
 * accumulator (c0, c1, c2); c0 is then stored to result[k] and the accumulator
 * is shifted down by one word. Indices k and i are kept as byte offsets
 * (word index times 4).
 *
 * Two variants:
 *  - not uECC_arm_thumb: uses umull for the 32x32->64 partial product.
 *  - uECC_arm_thumb (Thumb-1): builds each partial product from four 16x16
 *    multiplies, and uses r8-r12/r14 as spill registers.
 */
467uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
468 const uECC_word_t *left,
469 const uECC_word_t *right,
470 wordcount_t num_words) {
471#if (uECC_PLATFORM != uECC_arm_thumb)
/* Accumulator words, low to high. */
472 uint32_t c0 = 0;
473 uint32_t c1 = 0;
474 uint32_t c2 = 0;
/* Byte offset of the result word currently being computed. */
475 uint32_t k = 0;
476 uint32_t i;
477 uint32_t t0, t1;
478
479 __asm__ volatile (
480 ".syntax unified \n\t"
481
482 "1: \n\t" /* outer loop (k < num_words) */
483 "movs %[i], #0 \n\t" /* i = 0 */
484 "b 3f \n\t"
485
486 "2: \n\t" /* outer loop (k >= num_words) */
487 "movs %[i], %[k] \n\t" /* i = k */
488 "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */
489
490 "3: \n\t" /* inner loop */
491 "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */
492
493 "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = right[k - i] */
494 "ldr %[t0], [%[left], %[i]] \n\t" /* t0 = left[i] */
495
496 "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * right[k - i] */
497
498 "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
499 "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
500 "adcs %[c2], %[c2], #0 \n\t" /* add carry to c2 */
501
502 "adds %[i], #4 \n\t" /* i += 4 */
503 "cmp %[i], %[last_word] \n\t" /* i > (num_words - 1) (times 4)? */
504 "bgt 4f \n\t" /* if so, exit the loop */
505 "cmp %[i], %[k] \n\t" /* i <= k? */
506 "ble 3b \n\t" /* if so, continue looping */
507
508 "4: \n\t" /* end inner loop */
509
510 "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
511 "mov %[c0], %[c1] \n\t" /* c0 = c1 */
512 "mov %[c1], %[c2] \n\t" /* c1 = c2 */
513 "movs %[c2], #0 \n\t" /* c2 = 0 */
514 "adds %[k], #4 \n\t" /* k += 4 */
515 "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
516 "ble 1b \n\t" /* if so, loop back, start with i = 0 */
517 "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
518 "ble 2b \n\t" /* if so, loop back, start with i = (k + 1) - num_words */
519 /* end outer loop */
520
521 "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
522 RESUME_SYNTAX
/* i, t0 and t1 are early-clobber scratch outputs so they never share a
 * register with the pointer inputs. */
523 : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
524 [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
525 : [result] "r" (result), [left] "r" (left), [right] "r" (right),
526 [last_word] "r" ((num_words - 1) * 4)
527 : "cc", "memory"
528 );
529
530#else /* Thumb-1 */
/* Scratch operands; the r3..r7 operand names are labels only, the compiler
 * picks the actual registers under the "l" constraint. */
531 uint32_t r4, r5, r6, r7;
532
533 __asm__ volatile (
534 ".syntax unified \n\t"
535 "subs %[r3], #1 \n\t" /* r3 = num_words - 1 */
536 "lsls %[r3], #2 \n\t" /* r3 = (num_words - 1) * 4 */
537 "mov r8, %[r3] \n\t" /* r8 = (num_words - 1) * 4 */
538 "lsls %[r3], #1 \n\t" /* r3 = (num_words - 1) * 8 */
539 "mov r9, %[r3] \n\t" /* r9 = (num_words - 1) * 8 */
540 "movs %[r3], #0 \n\t" /* c0 = 0 */
541 "movs %[r4], #0 \n\t" /* c1 = 0 */
542 "movs %[r5], #0 \n\t" /* c2 = 0 */
543 "movs %[r6], #0 \n\t" /* k = 0 */
544
545 "push {%[r0]} \n\t" /* keep result on the stack */
546
547 "1: \n\t" /* outer loop (k < num_words) */
548 "movs %[r7], #0 \n\t" /* r7 = i = 0 */
549 "b 3f \n\t"
550
551 "2: \n\t" /* outer loop (k >= num_words) */
552 "movs %[r7], %[r6] \n\t" /* r7 = k */
553 "mov %[r0], r8 \n\t" /* r0 = (num_words - 1) * 4 */
554 "subs %[r7], %[r0] \n\t" /* r7 = i = k - (num_words - 1) (times 4) */
555
556 "3: \n\t" /* inner loop */
557 "mov r10, %[r3] \n\t" /* spill c0 */
558 "mov r11, %[r4] \n\t" /* spill c1 */
559 "mov r12, %[r5] \n\t" /* spill c2 */
560 "mov r14, %[r6] \n\t" /* spill k */
561 "subs %[r0], %[r6], %[r7] \n\t" /* r0 = k - i */
562
563 "ldr %[r4], [%[r2], %[r0]] \n\t" /* r4 = right[k - i] */
564 "ldr %[r0], [%[r1], %[r7]] \n\t" /* r0 = left[i] */
565
566 "lsrs %[r3], %[r0], #16 \n\t" /* r3 = a1 */
567 "uxth %[r0], %[r0] \n\t" /* r0 = a0 */
568
569 "lsrs %[r5], %[r4], #16 \n\t" /* r5 = b1 */
570 "uxth %[r4], %[r4] \n\t" /* r4 = b0 */
571
572 "movs %[r6], %[r3] \n\t" /* r6 = a1 */
573 "muls %[r6], %[r5], %[r6] \n\t" /* r6 = a1 * b1 */
574 "muls %[r3], %[r4], %[r3] \n\t" /* r3 = b0 * a1 */
575 "muls %[r5], %[r0], %[r5] \n\t" /* r5 = a0 * b1 */
576 "muls %[r0], %[r4], %[r0] \n\t" /* r0 = a0 * b0 */
577
578 /* Add middle terms */
579 "lsls %[r4], %[r3], #16 \n\t"
580 "lsrs %[r3], %[r3], #16 \n\t"
581 "adds %[r0], %[r4] \n\t"
582 "adcs %[r6], %[r3] \n\t"
583
584 "lsls %[r4], %[r5], #16 \n\t"
585 "lsrs %[r5], %[r5], #16 \n\t"
586 "adds %[r0], %[r4] \n\t"
587 "adcs %[r6], %[r5] \n\t"
588
589 "mov %[r3], r10\n\t" /* restore c0 */
590 "mov %[r4], r11\n\t" /* restore c1 */
591 "mov %[r5], r12\n\t" /* restore c2 */
592 "adds %[r3], %[r0] \n\t" /* add low word to c0 */
593 "adcs %[r4], %[r6] \n\t" /* add high word to c1, including carry */
594 "movs %[r0], #0 \n\t" /* r0 = 0 (does not affect carry bit) */
595 "adcs %[r5], %[r0] \n\t" /* add carry to c2 */
596
597 "mov %[r6], r14\n\t" /* r6 = k */
598
599 "adds %[r7], #4 \n\t" /* i += 4 */
600 "cmp %[r7], r8 \n\t" /* i > (num_words - 1) (times 4)? */
601 "bgt 4f \n\t" /* if so, exit the loop */
602 "cmp %[r7], %[r6] \n\t" /* i <= k? */
603 "ble 3b \n\t" /* if so, continue looping */
604
605 "4: \n\t" /* end inner loop */
606
607 "ldr %[r0], [sp, #0] \n\t" /* r0 = result */
608
609 "str %[r3], [%[r0], %[r6]] \n\t" /* result[k] = c0 */
610 "mov %[r3], %[r4] \n\t" /* c0 = c1 */
611 "mov %[r4], %[r5] \n\t" /* c1 = c2 */
612 "movs %[r5], #0 \n\t" /* c2 = 0 */
613 "adds %[r6], #4 \n\t" /* k += 4 */
614 "cmp %[r6], r8 \n\t" /* k <= (num_words - 1) (times 4) ? */
615 "ble 1b \n\t" /* if so, loop back, start with i = 0 */
616 "cmp %[r6], r9 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
617 "ble 2b \n\t" /* if so, loop back, with i = (k + 1) - num_words */
618 /* end outer loop */
619
620 "str %[r3], [%[r0], %[r6]] \n\t" /* result[num_words * 2 - 1] = c0 */
621 "pop {%[r0]} \n\t" /* pop result off the stack */
622
Kazuho Okudaf5f6b2023-02-09 12:23:02 +0900623 RESUME_SYNTAX
Kazuho Oku3aa0c152016-11-01 13:48:56 +0900624 : [r3] "+l" (num_words), [r4] "=&l" (r4),
625 [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
/* r0 is declared input-only although it is used as scratch inside the block;
 * the push/pop pair restores its original value before the block ends. */
626 : [r0] "l" (result), [r1] "l" (left), [r2] "l" (right)
627 : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
628 );
629#endif
630}
631#define asm_mult 1
632#endif
633
634#if uECC_SQUARE_FUNC
635#if !asm_square
/*
 * Compact, looped multi-word squaring: result[0 .. 2*num_words-1] = left[]^2.
 * Only compiled when uECC_SQUARE_FUNC is enabled and no faster uECC_vli_square
 * was defined above (asm_square unset).
 *
 * Same word-at-a-time scheme as the looped uECC_vli_mult, but the inner loop
 * only visits pairs with i <= k - i: a product left[i] * left[k - i] with
 * i < k - i is added to the accumulator twice (once for each of the two
 * symmetric terms), while the diagonal product (i == k - i) is added once.
 * Indices k and i are kept as byte offsets (word index times 4).
 *
 * Two variants:
 *  - not uECC_arm_thumb: uses umull for the 32x32->64 partial product.
 *  - uECC_arm_thumb (Thumb-1): builds each partial product from four 16x16
 *    multiplies, and uses r8-r12/r14 as spill registers.
 */
636uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
637 const uECC_word_t *left,
638 wordcount_t num_words) {
639#if (uECC_PLATFORM != uECC_arm_thumb)
/* Accumulator words, low to high. */
640 uint32_t c0 = 0;
641 uint32_t c1 = 0;
642 uint32_t c2 = 0;
/* Byte offset of the result word currently being computed. */
643 uint32_t k = 0;
644 uint32_t i, tt;
645 uint32_t t0, t1;
646
647 __asm__ volatile (
648 ".syntax unified \n\t"
649
650 "1: \n\t" /* outer loop (k < num_words) */
651 "movs %[i], #0 \n\t" /* i = 0 */
652 "b 3f \n\t"
653
654 "2: \n\t" /* outer loop (k >= num_words) */
655 "movs %[i], %[k] \n\t" /* i = k */
656 "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */
657
658 "3: \n\t" /* inner loop */
659 "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
660
661 "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = left[k - i] */
662 "ldr %[t0], [%[left], %[i]] \n\t" /* t0 = left[i] */
663
664 "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * left[k - i] */
665
666 "cmp %[i], %[tt] \n\t" /* (i < k - i) ? */
667 "bge 4f \n\t" /* if i >= k - i, skip */
/* Off-diagonal term: this first addition plus the one at label 4 doubles it. */
668 "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
669 "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
670 "adcs %[c2], %[c2], #0 \n\t" /* add carry to c2 */
671
672 "4: \n\t"
673 "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
674 "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
675 "adcs %[c2], %[c2], #0 \n\t" /* add carry to c2 */
676
677 "adds %[i], #4 \n\t" /* i += 4 */
678 "cmp %[i], %[k] \n\t" /* i >= k? */
679 "bge 5f \n\t" /* if so, exit the loop */
680 "subs %[tt], %[k], %[i] \n\t" /* tt = k - i */
681 "cmp %[i], %[tt] \n\t" /* i <= k - i? */
682 "ble 3b \n\t" /* if so, continue looping */
683
684 "5: \n\t" /* end inner loop */
685
686 "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
687 "mov %[c0], %[c1] \n\t" /* c0 = c1 */
688 "mov %[c1], %[c2] \n\t" /* c1 = c2 */
689 "movs %[c2], #0 \n\t" /* c2 = 0 */
690 "adds %[k], #4 \n\t" /* k += 4 */
691 "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
692 "ble 1b \n\t" /* if so, loop back, start with i = 0 */
693 "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
694 "ble 2b \n\t" /* if so, loop back, start with i = (k + 1) - num_words */
695 /* end outer loop */
696
697 "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
698 RESUME_SYNTAX
/* i, tt, t0 and t1 are early-clobber scratch outputs so they never share a
 * register with the pointer inputs. */
699 : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
700 [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
701 : [result] "r" (result), [left] "r" (left), [last_word] "r" ((num_words - 1) * 4)
702 : "cc", "memory"
703 );
704
705#else
/* Thumb-1 variant. The r2..r7 operand names are labels only; the compiler
 * picks the actual registers under the "l" constraint. */
706 uint32_t r3, r4, r5, r6, r7;
707
708 __asm__ volatile (
709 ".syntax unified \n\t"
710 "subs %[r2], #1 \n\t" /* r2 = num_words - 1 */
711 "lsls %[r2], #2 \n\t" /* r2 = (num_words - 1) * 4 */
712 "mov r8, %[r2] \n\t" /* r8 = (num_words - 1) * 4 */
713 "lsls %[r2], #1 \n\t" /* r2 = (num_words - 1) * 8 */
714 "mov r9, %[r2] \n\t" /* r9 = (num_words - 1) * 8 */
715 "movs %[r2], #0 \n\t" /* c0 = 0 */
716 "movs %[r3], #0 \n\t" /* c1 = 0 */
717 "movs %[r4], #0 \n\t" /* c2 = 0 */
718 "movs %[r5], #0 \n\t" /* k = 0 */
719
720 "push {%[r0]} \n\t" /* keep result on the stack */
721
722 "1: \n\t" /* outer loop (k < num_words) */
723 "movs %[r6], #0 \n\t" /* r6 = i = 0 */
724 "b 3f \n\t"
725
726 "2: \n\t" /* outer loop (k >= num_words) */
727 "movs %[r6], %[r5] \n\t" /* r6 = k */
728 "mov %[r0], r8 \n\t" /* r0 = (num_words - 1) * 4 */
729 "subs %[r6], %[r0] \n\t" /* r6 = i = k - (num_words - 1) (times 4) */
730
731 "3: \n\t" /* inner loop */
732 "mov r10, %[r2] \n\t" /* spill c0 */
733 "mov r11, %[r3] \n\t" /* spill c1 */
734 "mov r12, %[r4] \n\t" /* spill c2 */
735 "mov r14, %[r5] \n\t" /* spill k */
736 "subs %[r7], %[r5], %[r6] \n\t" /* r7 = k - i */
737
738 "ldr %[r3], [%[r1], %[r7]] \n\t" /* r3 = left[k - i] */
739 "ldr %[r0], [%[r1], %[r6]] \n\t" /* r0 = left[i] */
740
741 "lsrs %[r2], %[r0], #16 \n\t" /* r2 = a1 */
742 "uxth %[r0], %[r0] \n\t" /* r0 = a0 */
743
744 "lsrs %[r4], %[r3], #16 \n\t" /* r4 = b1 */
745 "uxth %[r3], %[r3] \n\t" /* r3 = b0 */
746
747 "movs %[r5], %[r2] \n\t" /* r5 = a1 */
748 "muls %[r5], %[r4], %[r5] \n\t" /* r5 = a1 * b1 */
749 "muls %[r2], %[r3], %[r2] \n\t" /* r2 = b0 * a1 */
750 "muls %[r4], %[r0], %[r4] \n\t" /* r4 = a0 * b1 */
751 "muls %[r0], %[r3], %[r0] \n\t" /* r0 = a0 * b0 */
752
753 /* Add middle terms */
754 "lsls %[r3], %[r2], #16 \n\t"
755 "lsrs %[r2], %[r2], #16 \n\t"
756 "adds %[r0], %[r3] \n\t"
757 "adcs %[r5], %[r2] \n\t"
758
759 "lsls %[r3], %[r4], #16 \n\t"
760 "lsrs %[r4], %[r4], #16 \n\t"
761 "adds %[r0], %[r3] \n\t"
762 "adcs %[r5], %[r4] \n\t"
763
764 /* Add to acc, doubling if necessary */
765 "mov %[r2], r10\n\t" /* restore c0 */
766 "mov %[r3], r11\n\t" /* restore c1 */
767 "mov %[r4], r12\n\t" /* restore c2 */
768
769 "cmp %[r6], %[r7] \n\t" /* (i < k - i) ? */
770 "bge 4f \n\t" /* if i >= k - i, skip */
771 "movs %[r7], #0 \n\t" /* r7 = 0 */
772 "adds %[r2], %[r0] \n\t" /* add low word to c0 */
773 "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
774 "adcs %[r4], %[r7] \n\t" /* add carry to c2 */
775 "4: \n\t"
776 "movs %[r7], #0 \n\t" /* r7 = 0 */
777 "adds %[r2], %[r0] \n\t" /* add low word to c0 */
778 "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
779 "adcs %[r4], %[r7] \n\t" /* add carry to c2 */
780
781 "mov %[r5], r14\n\t" /* r5 = k */
782
783 "adds %[r6], #4 \n\t" /* i += 4 */
784 "cmp %[r6], %[r5] \n\t" /* i >= k? */
785 "bge 5f \n\t" /* if so, exit the loop */
786 "subs %[r7], %[r5], %[r6] \n\t" /* r7 = k - i */
787 "cmp %[r6], %[r7] \n\t" /* i <= k - i? */
788 "ble 3b \n\t" /* if so, continue looping */
789
790 "5: \n\t" /* end inner loop */
791
792 "ldr %[r0], [sp, #0] \n\t" /* r0 = result */
793
794 "str %[r2], [%[r0], %[r5]] \n\t" /* result[k] = c0 */
795 "mov %[r2], %[r3] \n\t" /* c0 = c1 */
796 "mov %[r3], %[r4] \n\t" /* c1 = c2 */
797 "movs %[r4], #0 \n\t" /* c2 = 0 */
798 "adds %[r5], #4 \n\t" /* k += 4 */
799 "cmp %[r5], r8 \n\t" /* k <= (num_words - 1) (times 4) ? */
800 "ble 1b \n\t" /* if so, loop back, start with i = 0 */
801 "cmp %[r5], r9 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
802 "ble 2b \n\t" /* if so, loop back, with i = (k + 1) - num_words */
803 /* end outer loop */
804
805 "str %[r2], [%[r0], %[r5]] \n\t" /* result[num_words * 2 - 1] = c0 */
806 "pop {%[r0]} \n\t" /* pop result off the stack */
807
Kazuho Okudaf5f6b2023-02-09 12:23:02 +0900808 RESUME_SYNTAX
Kazuho Oku3aa0c152016-11-01 13:48:56 +0900809 : [r2] "+l" (num_words), [r3] "=&l" (r3), [r4] "=&l" (r4),
810 [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
/* r0 is declared input-only although it is used as scratch inside the block;
 * the push/pop pair restores its original value before the block ends. */
811 : [r0] "l" (result), [r1] "l" (left)
812 : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
813 );
814#endif
815}
816#define asm_square 1
817#endif
818#endif /* uECC_SQUARE_FUNC */
819
820#endif /* _UECC_ASM_ARM_H_ */