P-256 assembly optimisations for AArch64.
The ARMv8 assembly code in this commit is mostly taken from OpenSSL's `ecp_nistz256-armv8.pl` at https://github.com/openssl/openssl/blob/19e277dd19f2897f6a7b7eb236abe46655e575bf/crypto/ec/asm/ecp_nistz256-armv8.pl (see Note 1), adapted to the implementation in p256-x86_64.c.
Most of the assembly functions in `crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl` that are required to support that code have analogous functions in the imported OpenSSL ARMv8 Perl assembly implementation, with the exception of:
- ecp_nistz256_select_w5
- ecp_nistz256_select_w7
Implementations of these two functions were added.
Summary of modifications to the imported code:
* Renamed to `p256-armv8-asm.pl`
* Modified the location of `arm-xlate.pl` and `arm_arch.h`
* Replaced the `scatter-gather subroutines` with `select subroutines`. The `select subroutines` are implemented for ARMv8 similarly to their x86_64 counterparts, `ecp_nistz256_select_w5` and `ecp_nistz256_select_w7` (a C sketch of the selection pattern follows this list).
* `ecp_nistz256_add` was removed because it conflicted, during the static build, with the function of the same name in p256-nistz.c. The latter calls another assembly function, `ecp_nistz256_point_add`.
* `__ecp_nistz256_add` renamed to `__ecp_nistz256_add_to` to avoid the conflict with the function `ecp_nistz256_add` during the static build.
* At l. 924, `add sp,sp,#256`: the constant 32*(12-4) is computed up front rather than left for the assembler to evaluate.
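As an aside, the constant-time selection pattern used by the new `select subroutines` can be sketched in C as follows (the helper name, plain-C form, and limb layout are illustrative only; the actual assembly builds the mask with `cmp`/`csetm` and conditionally overwrites the accumulators with the NEON `bit` instruction):
```
#include <stdint.h>

// Sketch of ecp_nistz256_select_w5: the table holds 16 entries of three
// field elements (12 64-bit limbs per entry). Every entry is read and
// masked, so the memory access pattern does not depend on the index.
static void select_w5_sketch(uint64_t val[12], const uint64_t in_t[16 * 12],
                             int index) {
  uint64_t out[12] = {0};
  for (int i = 0; i < 16; i++) {
    // All-ones when the 1-based position matches index, all-zeros otherwise.
    uint64_t mask = 0 - (uint64_t)((i + 1) == index);
    for (int j = 0; j < 12; j++) {
      out[j] |= in_t[i * 12 + j] & mask;
    }
  }
  for (int j = 0; j < 12; j++) {
    val[j] = out[j];
  }
}
```
`ecp_nistz256_select_w7` follows the same pattern with 64 entries of two field elements (8 limbs each).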
Other modifications:
* `beeu_mod_inverse_vartime()` was implemented for AArch64 in `p256_beeu-armv8-asm.pl` similarly to its implementation in `p256_beeu-x86_64-asm.pl` (a word-sized C illustration of the underlying loop follows this list).
* The files containing `p256-x86_64` in their names were renamed to `p256-nistz`, since the functions and tests defined in them now also run on ARMv8, if enabled.
* Updated `delocate.go` and `delocate.peg` to handle the offset calculation in the assembly instructions.
* Regenerated `delocate.peg.go`.
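As background for the `beeu_mod_inverse_vartime()` item above, here is a word-sized C illustration of the binary extended Euclidean loop. It mirrors the Python description embedded in `p256_beeu-armv8-asm.pl`; the function name and the use of `unsigned __int128` are assumptions made for this sketch, while the real code works on 4x64-bit limbs plus a carry word:
```
#include <stdint.h>
#include <stdio.h>

// Returns a^-1 mod n for odd n and 1 < a < n, or 0 if no inverse exists.
static uint64_t beeu_inverse_word(uint64_t a, uint64_t n) {
  unsigned __int128 X = 1, Y = 0;  // wide so X+n and Y+X cannot overflow
  uint64_t B = a, A = n;
  while (B != 0) {
    while ((B & 1) == 0) {  // B even: halve B and halve X modulo n
      B >>= 1;
      if (X & 1) X += n;
      X >>= 1;
    }
    while ((A & 1) == 0) {  // A even: halve A and halve Y modulo n
      A >>= 1;
      if (Y & 1) Y += n;
      Y >>= 1;
    }
    if (B >= A) {  // subtract the smaller from the larger, preserving
      B -= A;      // X*a == B (mod n) and -Y*a == A (mod n)
      X += Y;
    } else {
      A -= B;
      Y += X;
    }
  }
  if (A != 1) return 0;    // gcd(a, n) != 1, no inverse
  while (Y > n) Y -= n;
  return n - (uint64_t)Y;  // the inverse is -Y mod n
}

int main(void) {
  uint64_t inv = beeu_inverse_word(3, 65537);  // expect 3*inv % 65537 == 1
  printf("inv=%llu check=%llu\n", (unsigned long long)inv,
         (unsigned long long)((inv * 3) % 65537));
  return 0;
}
```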
Notes:
1- The last commit in the file's history is in master only; the previous commits are in OpenSSL 3.0.1.
2- This change focuses on AArch64 (64-bit architecture of ARMv8). It does not support ARMv4 or ARMv7.
Testing the performance on an Armv8 platform using -DCMAKE_BUILD_TYPE=Release:
Before:
```
Did 2596 ECDH P-256 operations in 1093956us (2373.0 ops/sec)
Did 6996 ECDSA P-256 signing operations in 1044630us (6697.1 ops/sec)
Did 2970 ECDSA P-256 verify operations in 1084848us (2737.7 ops/sec)
```
After:
```
Did 6699 ECDH P-256 operations in 1091684us (6136.4 ops/sec)
Did 20000 ECDSA P-256 signing operations in 1012944us (19744.4 ops/sec)
Did 7051 ECDSA P-256 verify operations in 1060000us (6651.9 ops/sec)
```
Change-Id: I9fdef12db365967a9264b5b32c07967b55ea48bd
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51805
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 312c080..1cb68ca 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -518,7 +518,7 @@
fipsmodule/aes/aes_test.cc
fipsmodule/bn/bn_test.cc
fipsmodule/ec/ec_test.cc
- fipsmodule/ec/p256-x86_64_test.cc
+ fipsmodule/ec/p256-nistz_test.cc
fipsmodule/ecdsa/ecdsa_test.cc
fipsmodule/md5/md5_test.cc
fipsmodule/modes/gcm_test.cc
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 73f8a02..b99ebc7 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -64,6 +64,8 @@
armv8-mont.${ASM_EXT}
ghash-neon-armv8.${ASM_EXT}
ghashv8-armx.${ASM_EXT}
+ p256-armv8-asm.${ASM_EXT}
+ p256_beeu-armv8-asm.${ASM_EXT}
sha1-armv8.${ASM_EXT}
sha256-armv8.${ASM_EXT}
sha512-armv8.${ASM_EXT}
@@ -102,6 +104,8 @@
perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
+perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl)
+perlasm(p256_beeu-armv8-asm.${ASM_EXT} ec/asm/p256_beeu-armv8-asm.pl)
perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl)
perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl)
perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
index 6f8f5c0..87618fe 100644
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -71,7 +71,7 @@
#include "ec/oct.c"
#include "ec/p224-64.c"
#include "ec/p256.c"
-#include "ec/p256-x86_64.c"
+#include "ec/p256-nistz.c"
#include "ec/scalar.c"
#include "ec/simple.c"
#include "ec/simple_mul.c"
diff --git a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
new file mode 100644
index 0000000..f2926b8
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
@@ -0,0 +1,1702 @@
+#! /usr/bin/env perl
+# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# ECP_NISTZ256 module for ARMv8.
+#
+# February 2015.
+#
+# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
+# http://eprint.iacr.org/2013/816.
+#
+# with/without -DECP_NISTZ256_ASM
+# Apple A7 +190-360%
+# Cortex-A53 +190-400%
+# Cortex-A57 +190-350%
+# Denver +230-400%
+#
+# Ranges denote minimum and maximum improvement coefficients depending
+# on benchmark. Lower coefficients are for ECDSA sign, server-side
+# operation. Keep in mind that +400% means 5x improvement.
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+ Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+{
+my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
+ $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
+ map("x$_",(0..17,19,20));
+
+my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
+
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.align 5
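+// The field prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1, as little-endian 64-bit limbs: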
+.Lpoly:
+.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+.LRR: // 2^512 mod P precomputed for NIST P256 polynomial
+.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
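+// 1 in the Montgomery domain, i.e. 2^256 mod p: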
+.Lone_mont:
+.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+.Lone:
+.quad 1,0,0,0
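+// The group order of the P-256 base point, as little-endian 64-bit limbs: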
+.Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
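+// Montgomery reduction constant for the group order, used by ecp_nistz256_ord_mul_mont/_sqr_mont below: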
+.LordK:
+.quad 0xccd1c8aaee00bc4f
+.asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+
+// void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_to_mont
+.type ecp_nistz256_to_mont,%function
+.align 6
+ecp_nistz256_to_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldr $bi,.LRR // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+ adr $bp,.LRR // &bp[0]
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+// void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_from_mont
+.type ecp_nistz256_from_mont,%function
+.align 4
+ecp_nistz256_from_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ mov $bi,#1 // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+ adr $bp,.Lone // &bp[0]
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+
+// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_mul_mont
+.type ecp_nistz256_mul_mont,%function
+.align 4
+ecp_nistz256_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldr $bi,[$bp] // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_sqr_mont
+.type ecp_nistz256_sqr_mont,%function
+.align 4
+ecp_nistz256_sqr_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ bl __ecp_nistz256_sqr_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_div_by_2
+.type ecp_nistz256_div_by_2,%function
+.align 4
+ecp_nistz256_div_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp $acc0,$acc1,[$ap]
+ ldp $acc2,$acc3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_2
+.type ecp_nistz256_mul_by_2,%function
+.align 4
+ecp_nistz256_mul_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp $acc0,$acc1,[$ap]
+ ldp $acc2,$acc3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_3
+.type ecp_nistz256_mul_by_3,%function
+.align 4
+ecp_nistz256_mul_by_3:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp $acc0,$acc1,[$ap]
+ ldp $acc2,$acc3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ mov $a0,$acc0
+ mov $a1,$acc1
+ mov $a2,$acc2
+ mov $a3,$acc3
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ mov $t0,$a0
+ mov $t1,$a1
+ mov $t2,$a2
+ mov $t3,$a3
+
+ bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_sub
+.type ecp_nistz256_sub,%function
+.align 4
+ecp_nistz256_sub:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp $acc0,$acc1,[$ap]
+ ldp $acc2,$acc3,[$ap,#16]
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_neg
+.type ecp_nistz256_neg,%function
+.align 4
+ecp_nistz256_neg:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov $bp,$ap
+ mov $acc0,xzr // a = 0
+ mov $acc1,xzr
+ mov $acc2,xzr
+ mov $acc3,xzr
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to $a0-$a3 and b[0] - to $bi
+.type __ecp_nistz256_mul_mont,%function
+.align 4
+__ecp_nistz256_mul_mont:
+ mul $acc0,$a0,$bi // a[0]*b[0]
+ umulh $t0,$a0,$bi
+
+ mul $acc1,$a1,$bi // a[1]*b[0]
+ umulh $t1,$a1,$bi
+
+ mul $acc2,$a2,$bi // a[2]*b[0]
+ umulh $t2,$a2,$bi
+
+ mul $acc3,$a3,$bi // a[3]*b[0]
+ umulh $t3,$a3,$bi
+ ldr $bi,[$bp,#8] // b[1]
+
+ adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
+ lsl $t0,$acc0,#32
+ adcs $acc2,$acc2,$t1
+ lsr $t1,$acc0,#32
+ adcs $acc3,$acc3,$t2
+ adc $acc4,xzr,$t3
+ mov $acc5,xzr
+___
+for($i=1;$i<4;$i++) {
+ # Reduction iteration is normally performed by accumulating
+ # result of multiplication of modulus by "magic" digit [and
+ # omitting least significant word, which is guaranteed to
+ # be 0], but thanks to special form of modulus and "magic"
+ # digit being equal to least significant word, it can be
+ # performed with additions and subtractions alone. Indeed:
+ #
+ # ffff0001.00000000.0000ffff.ffffffff
+ # * abcdefgh
+ # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
+ #
+ # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+ # rewrite above as:
+ #
+ # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
+ # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
+ # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
+ #
+ # or marking redundant operations:
+ #
+ # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
+ # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
+ # - 0000abcd.efgh0000.--------.--------.--------
+
+$code.=<<___;
+ subs $t2,$acc0,$t0 // "*0xffff0001"
+ sbc $t3,$acc0,$t1
+ adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
+ mul $t0,$a0,$bi // lo(a[0]*b[i])
+ adcs $acc1,$acc2,$t1
+ mul $t1,$a1,$bi // lo(a[1]*b[i])
+ adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
+ mul $t2,$a2,$bi // lo(a[2]*b[i])
+ adcs $acc3,$acc4,$t3
+ mul $t3,$a3,$bi // lo(a[3]*b[i])
+ adc $acc4,$acc5,xzr
+
+ adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
+ umulh $t0,$a0,$bi // hi(a[0]*b[i])
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi // hi(a[1]*b[i])
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi // hi(a[2]*b[i])
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi // hi(a[3]*b[i])
+ adc $acc4,$acc4,xzr
+___
+$code.=<<___ if ($i<3);
+ ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
+___
+$code.=<<___;
+ adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
+ lsl $t0,$acc0,#32
+ adcs $acc2,$acc2,$t1
+ lsr $t1,$acc0,#32
+ adcs $acc3,$acc3,$t2
+ adcs $acc4,$acc4,$t3
+ adc $acc5,xzr,xzr
+___
+}
+$code.=<<___;
+ // last reduction
+ subs $t2,$acc0,$t0 // "*0xffff0001"
+ sbc $t3,$acc0,$t1
+ adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
+ adcs $acc1,$acc2,$t1
+ adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
+ adcs $acc3,$acc4,$t3
+ adc $acc4,$acc5,xzr
+
+ adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
+ sbcs $t1,$acc1,$poly1
+ sbcs $t2,$acc2,xzr
+ sbcs $t3,$acc3,$poly3
+ sbcs xzr,$acc4,xzr // did it borrow?
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to $a0-$a3
+.type __ecp_nistz256_sqr_mont,%function
+.align 4
+__ecp_nistz256_sqr_mont:
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+ // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul $acc1,$a1,$a0 // a[1]*a[0]
+ umulh $t1,$a1,$a0
+ mul $acc2,$a2,$a0 // a[2]*a[0]
+ umulh $t2,$a2,$a0
+ mul $acc3,$a3,$a0 // a[3]*a[0]
+ umulh $acc4,$a3,$a0
+
+ adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
+ mul $t0,$a2,$a1 // a[2]*a[1]
+ umulh $t1,$a2,$a1
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a3,$a1 // a[3]*a[1]
+ umulh $t3,$a3,$a1
+ adc $acc4,$acc4,xzr // can't overflow
+
+ mul $acc5,$a3,$a2 // a[3]*a[2]
+ umulh $acc6,$a3,$a2
+
+ adds $t1,$t1,$t2 // accumulate high parts of multiplication
+ mul $acc0,$a0,$a0 // a[0]*a[0]
+ adc $t2,$t3,xzr // can't overflow
+
+ adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
+ umulh $a0,$a0,$a0
+ adcs $acc4,$acc4,$t1
+ mul $t1,$a1,$a1 // a[1]*a[1]
+ adcs $acc5,$acc5,$t2
+ umulh $a1,$a1,$a1
+ adc $acc6,$acc6,xzr // can't overflow
+
+ adds $acc1,$acc1,$acc1 // acc[1-6]*=2
+ mul $t2,$a2,$a2 // a[2]*a[2]
+ adcs $acc2,$acc2,$acc2
+ umulh $a2,$a2,$a2
+ adcs $acc3,$acc3,$acc3
+ mul $t3,$a3,$a3 // a[3]*a[3]
+ adcs $acc4,$acc4,$acc4
+ umulh $a3,$a3,$a3
+ adcs $acc5,$acc5,$acc5
+ adcs $acc6,$acc6,$acc6
+ adc $acc7,xzr,xzr
+
+ adds $acc1,$acc1,$a0 // +a[i]*a[i]
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$a1
+ adcs $acc4,$acc4,$t2
+ adcs $acc5,$acc5,$a2
+ lsl $t0,$acc0,#32
+ adcs $acc6,$acc6,$t3
+ lsr $t1,$acc0,#32
+ adc $acc7,$acc7,$a3
+___
+for($i=0;$i<3;$i++) { # reductions, see commentary in
+ # multiplication for details
+$code.=<<___;
+ subs $t2,$acc0,$t0 // "*0xffff0001"
+ sbc $t3,$acc0,$t1
+ adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
+ adcs $acc1,$acc2,$t1
+ lsl $t0,$acc0,#32
+ adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
+ lsr $t1,$acc0,#32
+ adc $acc3,$t3,xzr // can't overflow
+___
+}
+$code.=<<___;
+ subs $t2,$acc0,$t0 // "*0xffff0001"
+ sbc $t3,$acc0,$t1
+ adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
+ adcs $acc1,$acc2,$t1
+ adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
+ adc $acc3,$t3,xzr // can't overflow
+
+ adds $acc0,$acc0,$acc4 // accumulate upper half
+ adcs $acc1,$acc1,$acc5
+ adcs $acc2,$acc2,$acc6
+ adcs $acc3,$acc3,$acc7
+ adc $acc4,xzr,xzr
+
+ adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
+ sbcs $t1,$acc1,$poly1
+ sbcs $t2,$acc2,xzr
+ sbcs $t3,$acc3,$poly3
+ sbcs xzr,$acc4,xzr // did it borrow?
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
+.type __ecp_nistz256_add_to,%function
+.align 4
+__ecp_nistz256_add_to:
+ adds $acc0,$acc0,$t0 // ret = a+b
+ adcs $acc1,$acc1,$t1
+ adcs $acc2,$acc2,$t2
+ adcs $acc3,$acc3,$t3
+ adc $ap,xzr,xzr // zap $ap
+
+ adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
+ sbcs $t1,$acc1,$poly1
+ sbcs $t2,$acc2,xzr
+ sbcs $t3,$acc3,$poly3
+ sbcs xzr,$ap,xzr // did subtraction borrow?
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to
+
+.type __ecp_nistz256_sub_from,%function
+.align 4
+__ecp_nistz256_sub_from:
+ ldp $t0,$t1,[$bp]
+ ldp $t2,$t3,[$bp,#16]
+ subs $acc0,$acc0,$t0 // ret = a-b
+ sbcs $acc1,$acc1,$t1
+ sbcs $acc2,$acc2,$t2
+ sbcs $acc3,$acc3,$t3
+ sbc $ap,xzr,xzr // zap $ap
+
+ subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
+ adcs $t1,$acc1,$poly1
+ adcs $t2,$acc2,xzr
+ adc $t3,$acc3,$poly3
+ cmp $ap,xzr // did subtraction borrow?
+
+ csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
+ csel $acc1,$acc1,$t1,eq
+ csel $acc2,$acc2,$t2,eq
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,eq
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
+
+.type __ecp_nistz256_sub_morf,%function
+.align 4
+__ecp_nistz256_sub_morf:
+ ldp $t0,$t1,[$bp]
+ ldp $t2,$t3,[$bp,#16]
+ subs $acc0,$t0,$acc0 // ret = b-a
+ sbcs $acc1,$t1,$acc1
+ sbcs $acc2,$t2,$acc2
+ sbcs $acc3,$t3,$acc3
+ sbc $ap,xzr,xzr // zap $ap
+
+ subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
+ adcs $t1,$acc1,$poly1
+ adcs $t2,$acc2,xzr
+ adc $t3,$acc3,$poly3
+ cmp $ap,xzr // did subtraction borrow?
+
+ csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
+ csel $acc1,$acc1,$t1,eq
+ csel $acc2,$acc2,$t2,eq
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,eq
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
+
+.type __ecp_nistz256_div_by_2,%function
+.align 4
+__ecp_nistz256_div_by_2:
+ subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
+ adcs $t1,$acc1,$poly1
+ adcs $t2,$acc2,xzr
+ adcs $t3,$acc3,$poly3
+ adc $ap,xzr,xzr // zap $ap
+ tst $acc0,#1 // is a even?
+
+ csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
+ csel $acc1,$acc1,$t1,eq
+ csel $acc2,$acc2,$t2,eq
+ csel $acc3,$acc3,$t3,eq
+ csel $ap,xzr,$ap,eq
+
+ lsr $acc0,$acc0,#1 // ret >>= 1
+ orr $acc0,$acc0,$acc1,lsl#63
+ lsr $acc1,$acc1,#1
+ orr $acc1,$acc1,$acc2,lsl#63
+ lsr $acc2,$acc2,#1
+ orr $acc2,$acc2,$acc3,lsl#63
+ lsr $acc3,$acc3,#1
+ stp $acc0,$acc1,[$rp]
+ orr $acc3,$acc3,$ap,lsl#63
+ stp $acc2,$acc3,[$rp,#16]
+
+ ret
+.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
+___
+########################################################################
+# following subroutines are "literal" implementation of those found in
+# ecp_nistz256.c
+#
+########################################################################
+# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
+#
+{
+my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
+# above map() describes stack layout with 4 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real) = map("x$_",(21,22));
+
+$code.=<<___;
+.globl ecp_nistz256_point_double
+.type ecp_nistz256_point_double,%function
+.align 5
+ecp_nistz256_point_double:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ sub sp,sp,#32*4
+
+.Ldouble_shortcut:
+ ldp $acc0,$acc1,[$ap,#32]
+ mov $rp_real,$rp
+ ldp $acc2,$acc3,[$ap,#48]
+ mov $ap_real,$ap
+ ldr $poly1,.Lpoly+8
+ mov $t0,$acc0
+ ldr $poly3,.Lpoly+24
+ mov $t1,$acc1
+ ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
+ mov $t2,$acc2
+ mov $t3,$acc3
+ ldp $a2,$a3,[$ap_real,#64+16]
+ add $rp,sp,#$S
+ bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);
+
+ add $rp,sp,#$Zsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
+
+ ldp $t0,$t1,[$ap_real]
+ ldp $t2,$t3,[$ap_real,#16]
+ mov $a0,$acc0 // put Zsqr aside for p256_sub
+ mov $a1,$acc1
+ mov $a2,$acc2
+ mov $a3,$acc3
+ add $rp,sp,#$M
+ bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);
+
+ add $bp,$ap_real,#0
+ mov $acc0,$a0 // restore Zsqr
+ mov $acc1,$a1
+ ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
+ mov $acc2,$a2
+ mov $acc3,$a3
+ ldp $a2,$a3,[sp,#$S+16]
+ add $rp,sp,#$Zsqr
+ bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
+
+ add $rp,sp,#$S
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
+
+ ldr $bi,[$ap_real,#32]
+ ldp $a0,$a1,[$ap_real,#64]
+ ldp $a2,$a3,[$ap_real,#64+16]
+ add $bp,$ap_real,#32
+ add $rp,sp,#$tmp0
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
+
+ mov $t0,$acc0
+ mov $t1,$acc1
+ ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
+ mov $t2,$acc2
+ mov $t3,$acc3
+ ldp $a2,$a3,[sp,#$S+16]
+ add $rp,$rp_real,#64
+ bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);
+
+ add $rp,sp,#$tmp0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
+
+ ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$M]
+ ldp $a2,$a3,[sp,#$M+16]
+ add $rp,$rp_real,#32
+ bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
+
+ add $bp,sp,#$Zsqr
+ add $rp,sp,#$M
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
+
+ mov $t0,$acc0 // duplicate M
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ mov $a0,$acc0 // put M aside
+ mov $a1,$acc1
+ mov $a2,$acc2
+ mov $a3,$acc3
+ add $rp,sp,#$M
+ bl __ecp_nistz256_add_to
+ mov $t0,$a0 // restore M
+ mov $t1,$a1
+ ldr $bi,[$ap_real] // forward load for p256_mul_mont
+ mov $t2,$a2
+ ldp $a0,$a1,[sp,#$S]
+ mov $t3,$a3
+ ldp $a2,$a3,[sp,#$S+16]
+ bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);
+
+ add $bp,$ap_real,#0
+ add $rp,sp,#$S
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
+
+ mov $t0,$acc0
+ mov $t1,$acc1
+ ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
+ mov $t2,$acc2
+ mov $t3,$acc3
+ ldp $a2,$a3,[sp,#$M+16]
+ add $rp,sp,#$tmp0
+ bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);
+
+ add $rp,$rp_real,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
+
+ add $bp,sp,#$tmp0
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
+
+ add $bp,sp,#$S
+ add $rp,sp,#$S
+ bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
+
+ ldr $bi,[sp,#$M]
+ mov $a0,$acc0 // copy S
+ mov $a1,$acc1
+ mov $a2,$acc2
+ mov $a3,$acc3
+ add $bp,sp,#$M
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
+
+ add $bp,$rp_real,#32
+ add $rp,$rp_real,#32
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+___
+}
+
+########################################################################
+# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
+# const P256_POINT *in2);
+{
+my ($res_x,$res_y,$res_z,
+ $H,$Hsqr,$R,$Rsqr,$Hcub,
+ $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
+my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
+# above map() describes stack layout with 12 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
+
+$code.=<<___;
+.globl ecp_nistz256_point_add
+.type ecp_nistz256_point_add,%function
+.align 5
+ecp_nistz256_point_add:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#32*12
+
+ ldp $a0,$a1,[$bp,#64] // in2_z
+ ldp $a2,$a3,[$bp,#64+16]
+ mov $rp_real,$rp
+ mov $ap_real,$ap
+ mov $bp_real,$bp
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+ orr $t0,$a0,$a1
+ orr $t2,$a2,$a3
+ orr $in2infty,$t0,$t2
+ cmp $in2infty,#0
+ csetm $in2infty,ne // ~in2infty
+ add $rp,sp,#$Z2sqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
+
+ ldp $a0,$a1,[$ap_real,#64] // in1_z
+ ldp $a2,$a3,[$ap_real,#64+16]
+ orr $t0,$a0,$a1
+ orr $t2,$a2,$a3
+ orr $in1infty,$t0,$t2
+ cmp $in1infty,#0
+ csetm $in1infty,ne // ~in1infty
+ add $rp,sp,#$Z1sqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ ldr $bi,[$bp_real,#64]
+ ldp $a0,$a1,[sp,#$Z2sqr]
+ ldp $a2,$a3,[sp,#$Z2sqr+16]
+ add $bp,$bp_real,#64
+ add $rp,sp,#$S1
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
+
+ ldr $bi,[$ap_real,#64]
+ ldp $a0,$a1,[sp,#$Z1sqr]
+ ldp $a2,$a3,[sp,#$Z1sqr+16]
+ add $bp,$ap_real,#64
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr $bi,[$ap_real,#32]
+ ldp $a0,$a1,[sp,#$S1]
+ ldp $a2,$a3,[sp,#$S1+16]
+ add $bp,$ap_real,#32
+ add $rp,sp,#$S1
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
+
+ ldr $bi,[$bp_real,#32]
+ ldp $a0,$a1,[sp,#$S2]
+ ldp $a2,$a3,[sp,#$S2+16]
+ add $bp,$bp_real,#32
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add $bp,sp,#$S1
+ ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont
+ ldp $a0,$a1,[$ap_real]
+ ldp $a2,$a3,[$ap_real,#16]
+ add $rp,sp,#$R
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
+
+ orr $acc0,$acc0,$acc1 // see if result is zero
+ orr $acc2,$acc2,$acc3
+ orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2)
+
+ add $bp,sp,#$Z2sqr
+ add $rp,sp,#$U1
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
+
+ ldr $bi,[sp,#$Z1sqr]
+ ldp $a0,$a1,[$bp_real]
+ ldp $a2,$a3,[$bp_real,#16]
+ add $bp,sp,#$Z1sqr
+ add $rp,sp,#$U2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
+
+ add $bp,sp,#$U1
+ ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont
+ ldp $a2,$a3,[sp,#$R+16]
+ add $rp,sp,#$H
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
+
+ orr $acc0,$acc0,$acc1 // see if result is zero
+ orr $acc2,$acc2,$acc3
+ orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2)
+
+ mvn $temp1,$in1infty // -1/0 -> 0/-1
+ mvn $temp2,$in2infty // -1/0 -> 0/-1
+ orr $acc0,$acc0,$temp1
+ orr $acc0,$acc0,$temp2
+ orr $acc0,$acc0,$temp0
+ cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+.Ladd_double:
+ mov $ap,$ap_real
+ mov $rp,$rp_real
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
+ b .Ldouble_shortcut
+
+.align 4
+.Ladd_proceed:
+ add $rp,sp,#$Rsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr $bi,[$ap_real,#64]
+ ldp $a0,$a1,[sp,#$H]
+ ldp $a2,$a3,[sp,#$H+16]
+ add $bp,$ap_real,#64
+ add $rp,sp,#$res_z
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldp $a0,$a1,[sp,#$H]
+ ldp $a2,$a3,[sp,#$H+16]
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldr $bi,[$bp_real,#64]
+ ldp $a0,$a1,[sp,#$res_z]
+ ldp $a2,$a3,[sp,#$res_z+16]
+ add $bp,$bp_real,#64
+ add $rp,sp,#$res_z
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
+
+ ldr $bi,[sp,#$H]
+ ldp $a0,$a1,[sp,#$Hsqr]
+ ldp $a2,$a3,[sp,#$Hsqr+16]
+ add $bp,sp,#$H
+ add $rp,sp,#$Hcub
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr $bi,[sp,#$Hsqr]
+ ldp $a0,$a1,[sp,#$U1]
+ ldp $a2,$a3,[sp,#$U1+16]
+ add $bp,sp,#$Hsqr
+ add $rp,sp,#$U2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
+
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add $bp,sp,#$Rsqr
+ add $rp,sp,#$res_x
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add $bp,sp,#$Hcub
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add $bp,sp,#$U2
+ ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$S1]
+ ldp $a2,$a3,[sp,#$S1+16]
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add $bp,sp,#$Hcub
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
+
+ ldr $bi,[sp,#$R]
+ ldp $a0,$a1,[sp,#$res_y]
+ ldp $a2,$a3,[sp,#$res_y+16]
+ add $bp,sp,#$R
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add $bp,sp,#$S2
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp $a0,$a1,[sp,#$res_x] // res
+ ldp $a2,$a3,[sp,#$res_x+16]
+ ldp $t0,$t1,[$bp_real] // in2
+ ldp $t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) { # conditional moves
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1intfy, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ ldp $a0,$a1,[sp,#$res_x+$i+32] // res
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2intfy, remember?
+ ldp $a2,$a3,[sp,#$res_x+$i+48]
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ ldp $t0,$t1,[$bp_real,#$i+32] // in2
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ ldp $t2,$t3,[$bp_real,#$i+48]
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+___
+}
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1intfy, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2intfy, remember?
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+
+.Ladd_done:
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+___
+}
+
+########################################################################
+# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
+# const P256_POINT_AFFINE *in2);
+{
+my ($res_x,$res_y,$res_z,
+ $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
+my $Z1sqr = $S2;
+# above map() describes stack layout with 10 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
+
+$code.=<<___;
+.globl ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,%function
+.align 5
+ecp_nistz256_point_add_affine:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ sub sp,sp,#32*10
+
+ mov $rp_real,$rp
+ mov $ap_real,$ap
+ mov $bp_real,$bp
+ ldr $poly1,.Lpoly+8
+ ldr $poly3,.Lpoly+24
+
+ ldp $a0,$a1,[$ap,#64] // in1_z
+ ldp $a2,$a3,[$ap,#64+16]
+ orr $t0,$a0,$a1
+ orr $t2,$a2,$a3
+ orr $in1infty,$t0,$t2
+ cmp $in1infty,#0
+ csetm $in1infty,ne // ~in1infty
+
+ ldp $acc0,$acc1,[$bp] // in2_x
+ ldp $acc2,$acc3,[$bp,#16]
+ ldp $t0,$t1,[$bp,#32] // in2_y
+ ldp $t2,$t3,[$bp,#48]
+ orr $acc0,$acc0,$acc1
+ orr $acc2,$acc2,$acc3
+ orr $t0,$t0,$t1
+ orr $t2,$t2,$t3
+ orr $acc0,$acc0,$acc2
+ orr $t0,$t0,$t2
+ orr $in2infty,$acc0,$t0
+ cmp $in2infty,#0
+ csetm $in2infty,ne // ~in2infty
+
+ add $rp,sp,#$Z1sqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ mov $a0,$acc0
+ mov $a1,$acc1
+ mov $a2,$acc2
+ mov $a3,$acc3
+ ldr $bi,[$bp_real]
+ add $bp,$bp_real,#0
+ add $rp,sp,#$U2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
+
+ add $bp,$ap_real,#0
+ ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$Z1sqr]
+ ldp $a2,$a3,[sp,#$Z1sqr+16]
+ add $rp,sp,#$H
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
+
+ add $bp,$ap_real,#64
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr $bi,[$ap_real,#64]
+ ldp $a0,$a1,[sp,#$H]
+ ldp $a2,$a3,[sp,#$H+16]
+ add $bp,$ap_real,#64
+ add $rp,sp,#$res_z
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldr $bi,[$bp_real,#32]
+ ldp $a0,$a1,[sp,#$S2]
+ ldp $a2,$a3,[sp,#$S2+16]
+ add $bp,$bp_real,#32
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add $bp,$ap_real,#32
+ ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
+ ldp $a2,$a3,[sp,#$H+16]
+ add $rp,sp,#$R
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
+
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldp $a0,$a1,[sp,#$R]
+ ldp $a2,$a3,[sp,#$R+16]
+ add $rp,sp,#$Rsqr
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr $bi,[sp,#$H]
+ ldp $a0,$a1,[sp,#$Hsqr]
+ ldp $a2,$a3,[sp,#$Hsqr+16]
+ add $bp,sp,#$H
+ add $rp,sp,#$Hcub
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr $bi,[$ap_real]
+ ldp $a0,$a1,[sp,#$Hsqr]
+ ldp $a2,$a3,[sp,#$Hsqr+16]
+ add $bp,$ap_real,#0
+ add $rp,sp,#$U2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
+
+ mov $t0,$acc0
+ mov $t1,$acc1
+ mov $t2,$acc2
+ mov $t3,$acc3
+ add $rp,sp,#$Hsqr
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add $bp,sp,#$Rsqr
+ add $rp,sp,#$res_x
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add $bp,sp,#$Hcub
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add $bp,sp,#$U2
+ ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
+ ldp $a0,$a1,[sp,#$Hcub]
+ ldp $a2,$a3,[sp,#$Hcub+16]
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add $bp,$ap_real,#32
+ add $rp,sp,#$S2
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
+
+ ldr $bi,[sp,#$R]
+ ldp $a0,$a1,[sp,#$res_y]
+ ldp $a2,$a3,[sp,#$res_y+16]
+ add $bp,sp,#$R
+ add $rp,sp,#$res_y
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add $bp,sp,#$S2
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp $a0,$a1,[sp,#$res_x] // res
+ ldp $a2,$a3,[sp,#$res_x+16]
+ ldp $t0,$t1,[$bp_real] // in2
+ ldp $t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) { # conditional moves
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1intfy, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ ldp $a0,$a1,[sp,#$res_x+$i+32] // res
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2intfy, remember?
+ ldp $a2,$a3,[sp,#$res_x+$i+48]
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ ldp $t0,$t1,[$bp_real,#$i+32] // in2
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ ldp $t2,$t3,[$bp_real,#$i+48]
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+___
+$code.=<<___ if ($i == 0);
+ adr $bp_real,.Lone_mont-64
+___
+}
+$code.=<<___;
+ ldp $acc0,$acc1,[$ap_real,#$i] // in1
+ cmp $in1infty,#0 // ~$in1intfy, remember?
+ ldp $acc2,$acc3,[$ap_real,#$i+16]
+ csel $t0,$a0,$t0,ne
+ csel $t1,$a1,$t1,ne
+ csel $t2,$a2,$t2,ne
+ csel $t3,$a3,$t3,ne
+ cmp $in2infty,#0 // ~$in2intfy, remember?
+ csel $acc0,$t0,$acc0,ne
+ csel $acc1,$t1,$acc1,ne
+ csel $acc2,$t2,$acc2,ne
+ csel $acc3,$t3,$acc3,ne
+ stp $acc0,$acc1,[$rp_real,#$i]
+ stp $acc2,$acc3,[$rp_real,#$i+16]
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldr $bi,[$bp] // bp[0]
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+
+ mul $acc0,$a0,$bi // a[0]*b[0]
+ umulh $t0,$a0,$bi
+
+ mul $acc1,$a1,$bi // a[1]*b[0]
+ umulh $t1,$a1,$bi
+
+ mul $acc2,$a2,$bi // a[2]*b[0]
+ umulh $t2,$a2,$bi
+
+ mul $acc3,$a3,$bi // a[3]*b[0]
+ umulh $acc4,$a3,$bi
+
+ mul $t4,$acc0,$ordk
+
+ adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adc $acc4,$acc4,xzr
+ mov $acc5,xzr
+___
+for ($i=1;$i<4;$i++) {
+ ################################################################
+ # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
+ # * abcdefgh
+ # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ #
+ # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+ # rewrite above as:
+ #
+ # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+ # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
+ # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
+$code.=<<___;
+ ldr $bi,[$bp,#8*$i] // b[i]
+
+ lsl $t0,$t4,#32
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ mul $t0,$a0,$bi
+ adc $t3,$t3,xzr
+ mul $t1,$a1,$bi
+
+ adds $acc0,$acc1,$t2
+ mul $t2,$a2,$bi
+ adcs $acc1,$acc2,$t3
+ mul $t3,$a3,$bi
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ adds $acc0,$acc0,$t0 // accumulate low parts
+ umulh $t0,$a0,$bi
+ adcs $acc1,$acc1,$t1
+ umulh $t1,$a1,$bi
+ adcs $acc2,$acc2,$t2
+ umulh $t2,$a2,$bi
+ adcs $acc3,$acc3,$t3
+ umulh $t3,$a3,$bi
+ adc $acc4,$acc4,xzr
+ mul $t4,$acc0,$ordk
+ adds $acc1,$acc1,$t0 // accumulate high parts
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$t2
+ adcs $acc4,$acc4,$t3
+ adc $acc5,xzr,xzr
+___
+}
+$code.=<<___;
+ lsl $t0,$t4,#32 // last reduction
+ subs $acc2,$acc2,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc3,$acc3,$t0
+ sbcs $acc4,$acc4,$t1
+ sbc $acc5,$acc5,xzr
+
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adcs $acc3,$acc4,$t4
+ adc $acc4,$acc5,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $acc1,$acc1,$t1,lo
+ csel $acc2,$acc2,$t2,lo
+ stp $acc0,$acc1,[$rp]
+ csel $acc3,$acc3,$t3,lo
+ stp $acc2,$acc3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// int rep);
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,%function
+.align 4
+ecp_nistz256_ord_sqr_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adr $ordk,.Lord
+ ldp $a0,$a1,[$ap]
+ ldp $a2,$a3,[$ap,#16]
+
+ ldp $ord0,$ord1,[$ordk,#0]
+ ldp $ord2,$ord3,[$ordk,#16]
+ ldr $ordk,[$ordk,#32]
+ b .Loop_ord_sqr
+
+.align 4
+.Loop_ord_sqr:
+ sub $bp,$bp,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+ // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul $acc1,$a1,$a0 // a[1]*a[0]
+ umulh $t1,$a1,$a0
+ mul $acc2,$a2,$a0 // a[2]*a[0]
+ umulh $t2,$a2,$a0
+ mul $acc3,$a3,$a0 // a[3]*a[0]
+ umulh $acc4,$a3,$a0
+
+ adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
+ mul $t0,$a2,$a1 // a[2]*a[1]
+ umulh $t1,$a2,$a1
+ adcs $acc3,$acc3,$t2
+ mul $t2,$a3,$a1 // a[3]*a[1]
+ umulh $t3,$a3,$a1
+ adc $acc4,$acc4,xzr // can't overflow
+
+ mul $acc5,$a3,$a2 // a[3]*a[2]
+ umulh $acc6,$a3,$a2
+
+ adds $t1,$t1,$t2 // accumulate high parts of multiplication
+ mul $acc0,$a0,$a0 // a[0]*a[0]
+ adc $t2,$t3,xzr // can't overflow
+
+ adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
+ umulh $a0,$a0,$a0
+ adcs $acc4,$acc4,$t1
+ mul $t1,$a1,$a1 // a[1]*a[1]
+ adcs $acc5,$acc5,$t2
+ umulh $a1,$a1,$a1
+ adc $acc6,$acc6,xzr // can't overflow
+
+ adds $acc1,$acc1,$acc1 // acc[1-6]*=2
+ mul $t2,$a2,$a2 // a[2]*a[2]
+ adcs $acc2,$acc2,$acc2
+ umulh $a2,$a2,$a2
+ adcs $acc3,$acc3,$acc3
+ mul $t3,$a3,$a3 // a[3]*a[3]
+ adcs $acc4,$acc4,$acc4
+ umulh $a3,$a3,$a3
+ adcs $acc5,$acc5,$acc5
+ adcs $acc6,$acc6,$acc6
+ adc $acc7,xzr,xzr
+
+ adds $acc1,$acc1,$a0 // +a[i]*a[i]
+ mul $t4,$acc0,$ordk
+ adcs $acc2,$acc2,$t1
+ adcs $acc3,$acc3,$a1
+ adcs $acc4,$acc4,$t2
+ adcs $acc5,$acc5,$a2
+ adcs $acc6,$acc6,$t3
+ adc $acc7,$acc7,$a3
+___
+for($i=0; $i<4; $i++) { # reductions
+$code.=<<___;
+ subs xzr,$acc0,#1
+ umulh $t1,$ord0,$t4
+ mul $t2,$ord1,$t4
+ umulh $t3,$ord1,$t4
+
+ adcs $t2,$t2,$t1
+ adc $t3,$t3,xzr
+
+ adds $acc0,$acc1,$t2
+ adcs $acc1,$acc2,$t3
+ adcs $acc2,$acc3,$t4
+ adc $acc3,xzr,$t4 // can't overflow
+___
+$code.=<<___ if ($i<3);
+ mul $t3,$acc0,$ordk
+___
+$code.=<<___;
+ lsl $t0,$t4,#32
+ subs $acc1,$acc1,$t4
+ lsr $t1,$t4,#32
+ sbcs $acc2,$acc2,$t0
+ sbc $acc3,$acc3,$t1 // can't borrow
+___
+ ($t3,$t4) = ($t4,$t3);
+}
+$code.=<<___;
+ adds $acc0,$acc0,$acc4 // accumulate upper half
+ adcs $acc1,$acc1,$acc5
+ adcs $acc2,$acc2,$acc6
+ adcs $acc3,$acc3,$acc7
+ adc $acc4,xzr,xzr
+
+ subs $t0,$acc0,$ord0 // ret -= modulus
+ sbcs $t1,$acc1,$ord1
+ sbcs $t2,$acc2,$ord2
+ sbcs $t3,$acc3,$ord3
+ sbcs xzr,$acc4,xzr
+
+ csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
+ csel $a1,$acc1,$t1,lo
+ csel $a2,$acc2,$t2,lo
+ csel $a3,$acc3,$t3,lo
+
+ cbnz $bp,.Loop_ord_sqr
+
+ stp $a0,$a1,[$rp]
+ stp $a2,$a3,[$rp,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+} }
+
+########################################################################
+# select subroutines
+# These select functions are similar to those in p256-x86_64-asm.pl.
+# They load all entries of the lookup table, keeping in the output
+# only the one corresponding to the input index.
+{
+my ($val,$in_t)=map("x$_",(0..1));
+my ($index)=("w2");
+my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
+my ($Mask)=("v3");
+my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
+my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,%function
+.align 4
+ecp_nistz256_select_w5:
+ AARCH64_VALID_CALL_TARGET
+
+ // $Val_in := $val
+ // $Idx_ctr := 0; loop counter and incremented internal index
+ mov $Val_in, $val
+ mov $Idx_ctr, #0
+
+ // [$Ra-$Rf] := 0
+ movi $Ra.16b, #0
+ movi $Rb.16b, #0
+ movi $Rc.16b, #0
+ movi $Rd.16b, #0
+ movi $Re.16b, #0
+ movi $Rf.16b, #0
+
+.Lselect_w5_loop:
+ // Loop 16 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add $Idx_ctr, $Idx_ctr, #1
+
+ // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
+ // and advance $in_t to point to the next entry
+ ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+ // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
+ cmp $Idx_ctr, $index
+ csetm $Mask_64, eq
+
+ // continue loading ...
+ ld1 {$T0e.2d, $T0f.2d}, [$in_t],#32
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup $Mask.2d, $Mask_64
+
+ // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+ // i.e., values in output registers will remain the same if $Idx_ctr != $index
+ bit $Ra.16b, $T0a.16b, $Mask.16b
+ bit $Rb.16b, $T0b.16b, $Mask.16b
+
+ bit $Rc.16b, $T0c.16b, $Mask.16b
+ bit $Rd.16b, $T0d.16b, $Mask.16b
+
+ bit $Re.16b, $T0e.16b, $Mask.16b
+ bit $Rf.16b, $T0f.16b, $Mask.16b
+
+ // If bit #4 is not 0 (i.e. idx_ctr < 16) loop back
+ tbz $Idx_ctr, #4, .Lselect_w5_loop
+
+ // Write [$Ra-$Rf] to memory at the output pointer
+ st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
+ st1 {$Re.2d, $Rf.2d}, [$Val_in]
+
+ ret
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,%function
+.align 4
+ecp_nistz256_select_w7:
+ AARCH64_VALID_CALL_TARGET
+
+ // $Idx_ctr := 0; loop counter and incremented internal index
+ mov $Idx_ctr, #0
+
+ // [$Ra-$Rf] := 0
+ movi $Ra.16b, #0
+ movi $Rb.16b, #0
+ movi $Rc.16b, #0
+ movi $Rd.16b, #0
+
+.Lselect_w7_loop:
+ // Loop 64 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add $Idx_ctr, $Idx_ctr, #1
+
+ // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
+ // and advance $in_t to point to the next entry
+ ld1 {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+ // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
+ cmp $Idx_ctr, $index
+ csetm $Mask_64, eq
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup $Mask.2d, $Mask_64
+
+ // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+ // i.e., values in output registers will remain the same if $Idx_ctr != $index
+ bit $Ra.16b, $T0a.16b, $Mask.16b
+ bit $Rb.16b, $T0b.16b, $Mask.16b
+
+ bit $Rc.16b, $T0c.16b, $Mask.16b
+ bit $Rd.16b, $T0d.16b, $Mask.16b
+
+ // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back
+ tbz $Idx_ctr, #6, .Lselect_w7_loop
+
+ // Write [$Ra-$Rd] to memory at the output pointer
+ st1 {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]
+
+ ret
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+___
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
new file mode 100644
index 0000000..e259aef
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
@@ -0,0 +1,455 @@
+# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+#
+# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd).
+#
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+ Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+# BN_ULONG a[P256_LIMBS],
+# BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended GCD (Euclidean) Algorithm.
+# See A. Menezes, P. van Oorschot, and S. Vanstone's Handbook of Applied Cryptography,
+# Chapter 14, Algorithm 14.61 and Note 14.64
+# http://cacr.uwaterloo.ca/hac/about/chap14.pdf)
+
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+# Details
+# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse.
+# In other words,
+# ax == 1 (mod y) (where the symbol "==" denotes "congruent")
+# a == x^{-1} (mod y)
+#
+# It can be shown that throughout all the iterations of the algorithm, the following holds:
+# u = Ax + By
+# v = Cx + Dy
+# The values B and D are not of interest in this case, so they need not be computed by the algorithm.
+# This means the following congruences hold through the iterations of the algorithm.
+# Ax == u (mod y)
+# Cx == v (mod y)
+
+# Now we will modify the notation to match that of BN_mod_inverse_odd()
+# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based.
+# In those functions:
+# x, y -> a, n
+# u, v -> B, A
+# A, C -> X, Y’, where Y’ = -Y
+# Hence, the following holds throughout the algorithm iterations
+# Xa == B (mod n)
+# -Ya == A (mod n)
+#
+# Same algorithm in Python:
+# def beeu(a, n):
+# X = 1
+# Y = 0
+# B = a
+# A = n
+# while (B != 0):
+# while (B % 2) == 0:
+# B >>= 1
+# if (X % 2) == 1:
+# X = X + n
+# X >>= 1
+# while (A % 2) == 0:
+# A >>= 1
+# if (Y % 2) == 1:
+# Y = Y + n
+# Y >>= 1
+# if (B >= A):
+# B = B - A
+# X = X + Y
+# else:
+# A = A - B
+# Y = Y + X
+# if (A != 1):
+# # error
+# return 0
+# else:
+# while (Y > n):
+# Y = Y - n
+# Y = n - Y
+# return Y
+
+
+# For the internal variables,
+# x0-x2, x30 are used to hold the modulus n. The input parameters passed in
+# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack.
+# x3-x7 normally hold parameters, but this function takes none in them, so they are corruptible
+# x8 is corruptible here
+# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address
+# for that struct).
+# x9-x15 are corruptible registers
+# x19-x28 are callee-saved registers
+
+# X/Y will hold the inverse parameter
+# Assumption: a,n,X,Y < 2^(256)
+# Initially, X := 1, Y := 0
+# A := n, B := a
+
+# Function parameters (as per the Procedure Call Standard)
+my($out, $a_in, $n_in)=map("x$_",(0..2));
+# Internal variables
+my($n0, $n1, $n2, $n3)=map("x$_",(0..2,30));
+my($x0, $x1, $x2, $x3, $x4)=map("x$_",(3..7));
+my($y0, $y1, $y2, $y3, $y4)=map("x$_",(8..12));
+my($shift)=("x13");
+my($t0, $t1, $t2, $t3)=map("x$_",(14,15,19,20));
+my($a0, $a1, $a2, $a3)=map("x$_",(21..24));
+my($b0, $b1, $b2, $b3)=map("x$_",(25..28));
+
+# if B == 0, jump to end of loop
+sub TEST_B_ZERO {
+ return <<___;
+ orr $t0, $b0, $b1
+ orr $t0, $t0, $b2
+
+ // reverse the bit order of $b0. This is needed for clz after this macro
+ rbit $t1, $b0
+
+ orr $t0, $t0, $b3
+ cbz $t0,.Lbeeu_loop_end
+___
+}
+
+# Shift right by 1 bit, adding the modulus first if the variable is odd
+# if least_sig_bit(var0) == 0,
+# goto shift1_<ctr>
+# else
+# add n and goto shift1_<ctr>
+# Prerequisite: t0 = 0
+$g_next_label = 0;
+sub SHIFT1 {
+ my ($var0, $var1, $var2, $var3, $var4) = @_;
+ my $label = ".Lshift1_${g_next_label}";
+ $g_next_label++;
+ return <<___;
+ tbz $var0, #0, $label
+ adds $var0, $var0, $n0
+ adcs $var1, $var1, $n1
+ adcs $var2, $var2, $n2
+ adcs $var3, $var3, $n3
+ adc $var4, $var4, $t0
+$label:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr $var0, $var1, $var0, #1
+ extr $var1, $var2, $var1, #1
+ extr $var2, $var3, $var2, #1
+ extr $var3, $var4, $var3, #1
+ lsr $var4, $var4, #1
+___
+}
+
+# compilation by clang 10.0.0 with -O2/-O3 of
+# a[0] = (a[0] >> count) | (a[1] << (64-count));
+# a[1] = (a[1] >> count) | (a[2] << (64-count));
+# a[2] = (a[2] >> count) | (a[3] << (64-count));
+# a[3] >>= count;
+# Note: EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ
+# except that the second source operand of EXTR is only immediate;
+# that's why it cannot be used here where $shift is a variable
+#
+# In the following,
+# t0 := 0 - shift
+#
+# then var0, for example, will be shifted right as follows:
+# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64))
+# "uint() mod 64" is from the definition of LSL and LSR instructions.
+#
+# What matters here is the order of instructions relative to certain other
+# instructions, i.e.
+# - lsr and lsl must precede orr of the corresponding registers.
+# - lsl must precede the lsr of the same register afterwards.
+# The chosen order of the instructions overall is to try and maximize
+# the pipeline usage.
+sub SHIFT256 {
+ my ($var0, $var1, $var2, $var3) = @_;
+ return <<___;
+ neg $t0, $shift
+ lsr $var0, $var0, $shift
+ lsl $t1, $var1, $t0
+
+ lsr $var1, $var1, $shift
+ lsl $t2, $var2, $t0
+
+ orr $var0, $var0, $t1
+
+ lsr $var2, $var2, $shift
+ lsl $t3, $var3, $t0
+
+ orr $var1, $var1, $t2
+
+ lsr $var3, $var3, $shift
+
+ orr $var2, $var2, $t3
+___
+}
+
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.globl beeu_mod_inverse_vartime
+.type beeu_mod_inverse_vartime, %function
+.align 4
+beeu_mod_inverse_vartime:
+ // Reserve enough space for 14 8-byte registers on the stack
+ // in the first stp call for x29, x30.
+ // Then store the remaining callee-saved registers.
+ //
+ // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
+ // ^ ^
+ // sp <------------------- 112 bytes ----------------> old sp
+ // x29 (FP)
+ //
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-112]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x2,[sp,#96]
+
+ // B = b3..b0 := a
+ ldp $b0,$b1,[$a_in]
+ ldp $b2,$b3,[$a_in,#16]
+
+ // n3..n0 := n
+	// Note: the values of the input parameters are changed in the following.
+ ldp $n0,$n1,[$n_in]
+ ldp $n2,$n3,[$n_in,#16]
+
+ // A = a3..a0 := n
+ mov $a0, $n0
+ mov $a1, $n1
+ mov $a2, $n2
+ mov $a3, $n3
+
+ // X = x4..x0 := 1
+ mov $x0, #1
+ eor $x1, $x1, $x1
+ eor $x2, $x2, $x2
+ eor $x3, $x3, $x3
+ eor $x4, $x4, $x4
+
+ // Y = y4..y0 := 0
+ eor $y0, $y0, $y0
+ eor $y1, $y1, $y1
+ eor $y2, $y2, $y2
+ eor $y3, $y3, $y3
+ eor $y4, $y4, $y4
+
+.Lbeeu_loop:
+ // if B == 0, jump to .Lbeeu_loop_end
+ ${\TEST_B_ZERO}
+
+ // 0 < B < |n|,
+ // 0 < A <= |n|,
+ // (1) X*a == B (mod |n|),
+ // (2) (-1)*Y*a == A (mod |n|)
+
+ // Now divide B by the maximum possible power of two in the
+ // integers, and divide X by the same value mod |n|.
+ // When we're done, (1) still holds.
+
+ // shift := number of trailing 0s in $b0
+ // ( = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
+ clz $shift, $t1
+
+ // If there is no shift, goto shift_A_Y
+ cbz $shift, .Lbeeu_shift_A_Y
+
+ // Shift B right by "$shift" bits
+ ${\SHIFT256($b0, $b1, $b2, $b3)}
+
+ // Shift X right by "$shift" bits, adding n whenever X becomes odd.
+ // $shift--;
+ // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+ eor $t0, $t0, $t0
+.Lbeeu_shift_loop_X:
+ ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+ subs $shift, $shift, #1
+ bne .Lbeeu_shift_loop_X
+
+ // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+ // with the following differences:
+ // - "$shift" is set directly to the number of trailing 0s in B
+ // (using rbit and clz instructions)
+ // - The loop is only used to call SHIFT1(X)
+ // and $shift is decreased while executing the X loop.
+ // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+ // Same for A and Y.
+ // Afterwards, (2) still holds.
+ // Reverse the bit order of $a0
+ // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
+ rbit $t1, $a0
+ clz $shift, $t1
+
+ // If there is no shift, goto |B-A|, X+Y update
+ cbz $shift, .Lbeeu_update_B_X_or_A_Y
+
+ // Shift A right by "$shift" bits
+ ${\SHIFT256($a0, $a1, $a2, $a3)}
+
+ // Shift Y right by "$shift" bits, adding n whenever Y becomes odd.
+ // $shift--;
+ // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+ eor $t0, $t0, $t0
+.Lbeeu_shift_loop_Y:
+ ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+ subs $shift, $shift, #1
+ bne .Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+ // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+	// Note: this is unsigned arithmetic: T fits in 4 64-bit words and no sign
+	//       bit is kept. A cleared carry (i.e. a borrow) would indicate a
+	//       negative result. See, for example,
+ // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+ subs $t0, $b0, $a0
+ sbcs $t1, $b1, $a1
+ sbcs $t2, $b2, $a2
+ sbcs $t3, $b3, $a3
+ bcs .Lbeeu_B_greater_than_A
+
+ // Else A > B =>
+ // A := A - B; Y := Y + X; goto beginning of the loop
+ subs $a0, $a0, $b0
+ sbcs $a1, $a1, $b1
+ sbcs $a2, $a2, $b2
+ sbcs $a3, $a3, $b3
+
+ adds $y0, $y0, $x0
+ adcs $y1, $y1, $x1
+ adcs $y2, $y2, $x2
+ adcs $y3, $y3, $x3
+ adc $y4, $y4, $x4
+ b .Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+ // Continue with B > A =>
+ // B := B - A; X := X + Y; goto beginning of the loop
+ mov $b0, $t0
+ mov $b1, $t1
+ mov $b2, $t2
+ mov $b3, $t3
+
+ adds $x0, $x0, $y0
+ adcs $x1, $x1, $y1
+ adcs $x2, $x2, $y2
+ adcs $x3, $x3, $y3
+ adc $x4, $x4, $y4
+ b .Lbeeu_loop
+
+.Lbeeu_loop_end:
+	// Euclid's algorithm loop ends when A == gcd(a,n);
+	// this is 1 when a and n are co-prime (i.e. share no common factor).
+	// Since (-1)*Y*a == A (mod |n|) and Y > 0,
+	// the result is out = -Y mod n.
+
+ // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
+ // Is A-1 == 0?
+ // If not, fail.
+ sub $t0, $a0, #1
+ orr $t0, $t0, $a1
+ orr $t0, $t0, $a2
+ orr $t0, $t0, $a3
+ cbnz $t0, .Lbeeu_err
+
+ // If Y>n ==> Y:=Y-n
+.Lbeeu_reduction_loop:
+ // x_i := y_i - n_i (X is no longer needed, use it as temp)
+ // ($t0 = 0 from above)
+ subs $x0, $y0, $n0
+ sbcs $x1, $y1, $n1
+ sbcs $x2, $y2, $n2
+ sbcs $x3, $y3, $n3
+ sbcs $x4, $y4, $t0
+
+ // If result is non-negative (i.e., cs = carry set = no borrow),
+ // y_i := x_i; goto reduce again
+ // else
+ // y_i := y_i; continue
+ csel $y0, $x0, $y0, cs
+ csel $y1, $x1, $y1, cs
+ csel $y2, $x2, $y2, cs
+ csel $y3, $x3, $y3, cs
+ csel $y4, $x4, $y4, cs
+ bcs .Lbeeu_reduction_loop
+
+ // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+ // out = -Y = n-Y
+ subs $y0, $n0, $y0
+ sbcs $y1, $n1, $y1
+ sbcs $y2, $n2, $y2
+ sbcs $y3, $n3, $y3
+
+ // Save Y in output (out (x0) was saved on the stack)
+ ldr x3, [sp,#96]
+ stp $y0, $y1, [x3]
+ stp $y2, $y3, [x3,#16]
+ // return 1 (success)
+ mov x0, #1
+ b .Lbeeu_finish
+
+.Lbeeu_err:
+ // return 0 (error)
+ eor x0, x0, x0
+
+.Lbeeu_finish:
+ // Restore callee-saved registers, except x0, x2
+ add sp,x29,#0
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldp x29,x30,[sp],#112
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+___
+
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
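For orientation, the loop above is the classic binary extended Euclidean algorithm, maintaining the invariants stated in its comments: X*a == B (mod |n|) and (-1)*Y*a == A (mod |n|). The following single-word C model is only an illustrative sketch of that loop (it assumes an odd modulus that fits in one 64-bit word and a compiler with `__uint128_t`, unlike the 4- and 5-word values handled by the assembly):

```c
#include <stdint.h>
#include <stdio.h>

// Toy model of beeu_mod_inverse_vartime: returns a^-1 mod n, or 0 when a is
// not invertible. n must be odd (the P-256 group order is).
static uint64_t beeu_ref(uint64_t a, uint64_t n) {
  __uint128_t X = 1, Y = 0;   // may exceed n, like the five-word X/Y above
  uint64_t A = n, B = a;
  while (B != 0) {            // TEST_B_ZERO
    while ((B & 1) == 0) {    // divide B by 2 and ...
      B >>= 1;
      if (X & 1) X += n;      // ... SHIFT1: halve X mod n to keep X*a == B
      X >>= 1;
    }
    while ((A & 1) == 0) {    // same for A and Y
      A >>= 1;
      if (Y & 1) Y += n;
      Y >>= 1;
    }
    if (B >= A) {             // .Lbeeu_B_greater_than_A
      B -= A; X += Y;
    } else {                  // A > B
      A -= B; Y += X;
    }
  }
  if (A != 1) return 0;       // gcd(a, n) != 1  (.Lbeeu_err)
  while (Y >= n) Y -= n;      // .Lbeeu_reduction_loop
  return (uint64_t)(n - Y);   // out = -Y mod n
}

int main(void) {
  printf("%llu\n", (unsigned long long)beeu_ref(7, 101));  // prints 29
  return 0;
}
```

The assembly differs mainly in that it counts the trailing zero bits of B (and A) with rbit+clz and removes them in one SHIFT256 call, while the per-bit SHIFT1 loop is only needed for X and Y.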
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 93fdcfc..133f561 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -246,7 +246,8 @@
out->curves[2].param_len = 32;
out->curves[2].params = kP256Params;
out->curves[2].method =
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL)
EC_GFp_nistz256_method();
#else
diff --git a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
similarity index 96%
rename from crypto/fipsmodule/ec/make_p256-x86_64-tests.go
rename to crypto/fipsmodule/ec/make_p256-nistz-tests.go
index 958a97a..36194e6 100644
--- a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go
+++ b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
@@ -69,7 +69,7 @@
func isAffineInfinity(x, y *big.Int) bool {
// Infinity, in affine coordinates, is represented as (0, 0) by
- // both Go and p256-x86_64-asm.pl.
+	// Go, p256-x86_64-asm.pl, and p256-armv8-asm.pl.
return x.Sign() == 0 && y.Sign() == 0
}
@@ -107,8 +107,8 @@
// arbitrary X and Y and include the special case. We also have
// not verified that add and double preserve this
// property. Thus, generate test vectors with unrelated X and Y,
- // to test that p256-x86_64-asm.pl correctly handles
- // unconstrained representations of infinity.
+ // to test that p256-x86_64-asm.pl and p256-armv8-asm.pl correctly
+ // handle unconstrained representations of infinity.
x = randNonZeroInt(p)
y = randNonZeroInt(p)
z = zero
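For instance (illustrative values only), a generated "infinity" vector may carry X = 7, Y = 11, Z = 0: Z = 0 alone marks the point at infinity in this representation, so the assembly must treat such a triple exactly like (0, 0, 0).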
diff --git a/crypto/fipsmodule/ec/make_tables.go b/crypto/fipsmodule/ec/make_tables.go
index 34e8c23..dbcaab0 100644
--- a/crypto/fipsmodule/ec/make_tables.go
+++ b/crypto/fipsmodule/ec/make_tables.go
@@ -23,8 +23,8 @@
)
func main() {
- if err := writeP256X86_64Table("p256-x86_64-table.h"); err != nil {
- fmt.Fprintf(os.Stderr, "Error writing p256-x86_64-table.h: %s\n", err)
+ if err := writeP256NistzTable("p256-nistz-table.h"); err != nil {
+ fmt.Fprintf(os.Stderr, "Error writing p256-nistz-table.h: %s\n", err)
os.Exit(1)
}
@@ -34,7 +34,7 @@
}
}
-func writeP256X86_64Table(path string) error {
+func writeP256NistzTable(path string) error {
curve := elliptic.P256()
tables := make([][][2]*big.Int, 0, 37)
for shift := 0; shift < 256; shift += 7 {
@@ -59,7 +59,7 @@
*/
// This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
// subtables, each subtable contains 64 affine points. The affine points are
// encoded as eight uint64's, four for the x coordinate and four for the y.
// Both values are in little-endian order. There are 37 tables because a
diff --git a/crypto/fipsmodule/ec/p256-x86_64-table.h b/crypto/fipsmodule/ec/p256-nistz-table.h
similarity index 99%
rename from crypto/fipsmodule/ec/p256-x86_64-table.h
rename to crypto/fipsmodule/ec/p256-nistz-table.h
index 3af0b01..b81480b 100644
--- a/crypto/fipsmodule/ec/p256-x86_64-table.h
+++ b/crypto/fipsmodule/ec/p256-nistz-table.h
@@ -9,7 +9,7 @@
*/
// This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
// subtables, each subtable contains 64 affine points. The affine points are
// encoded as eight uint64's, four for the x coordinate and four for the y.
// Both values are in little-endian order. There are 37 tables because a
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-nistz.c
similarity index 98%
rename from crypto/fipsmodule/ec/p256-x86_64.c
rename to crypto/fipsmodule/ec/p256-nistz.c
index 506b7d2..12a4da6 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -30,10 +30,10 @@
#include "../delocate.h"
#include "../../internal.h"
#include "internal.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL)
typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
@@ -45,7 +45,7 @@
};
// Precomputed tables for the default generator
-#include "p256-x86_64-table.h"
+#include "p256-nistz-table.h"
// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in
// util.c for details
@@ -554,10 +554,12 @@
static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
EC_SCALAR *out,
const EC_SCALAR *in) {
+#if defined(OPENSSL_X86_64)
if (!CRYPTO_is_AVX_capable()) {
// No AVX support; fallback to generic code.
return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
}
+#endif
assert(group->order.width == P256_LIMBS);
if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) {
@@ -628,5 +630,6 @@
out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
}
-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#endif /* !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL) */
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-nistz.h
similarity index 95%
rename from crypto/fipsmodule/ec/p256-x86_64.h
rename to crypto/fipsmodule/ec/p256-nistz.h
index 5deb81a..0d0a6be 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-nistz.h
@@ -30,7 +30,8 @@
#endif
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL)
// P-256 field operations.
@@ -142,8 +143,9 @@
void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
- !defined(OPENSSL_SMALL) */
+#endif /* !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
+ !defined(OPENSSL_SMALL) */
#if defined(__cplusplus)
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-nistz_test.cc
similarity index 97%
rename from crypto/fipsmodule/ec/p256-x86_64_test.cc
rename to crypto/fipsmodule/ec/p256-nistz_test.cc
index f6f070a..73944db 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-nistz_test.cc
@@ -30,15 +30,16 @@
#include "../../test/abi_test.h"
#include "../../test/file_test.h"
#include "../../test/test_util.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"
// Disable tests if BORINGSSL_SHARED_LIBRARY is defined. These tests need access
// to internal functions.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL) && !defined(BORINGSSL_SHARED_LIBRARY)
-TEST(P256_X86_64Test, SelectW5) {
+TEST(P256_NistzTest, SelectW5) {
// Fill a table with some garbage input.
alignas(64) P256_POINT table[16];
for (size_t i = 0; i < 16; i++) {
@@ -68,7 +69,7 @@
CHECK_ABI(ecp_nistz256_select_w5, &val, table, 7);
}
-TEST(P256_X86_64Test, SelectW7) {
+TEST(P256_NistzTest, SelectW7) {
// Fill a table with some garbage input.
alignas(64) P256_POINT_AFFINE table[64];
for (size_t i = 0; i < 64; i++) {
@@ -97,11 +98,13 @@
CHECK_ABI(ecp_nistz256_select_w7, &val, table, 42);
}
-TEST(P256_X86_64Test, BEEU) {
+TEST(P256_NistzTest, BEEU) {
+#if defined(OPENSSL_X86_64)
if (!CRYPTO_is_AVX_capable()) {
// No AVX support; cannot run the BEEU code.
return;
}
+#endif
bssl::UniquePtr<EC_GROUP> group(
EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
@@ -483,8 +486,8 @@
}
}
-TEST(P256_X86_64Test, TestVectors) {
- return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
+TEST(P256_NistzTest, TestVectors) {
+ return FileTestGTest("crypto/fipsmodule/ec/p256-nistz_tests.txt",
[](FileTest *t) {
if (t->GetParameter() == "Negate") {
TestNegate(t);
@@ -503,7 +506,7 @@
}
// Instrument the functions covered in TestVectors for ABI checking.
-TEST(P256_X86_64Test, ABI) {
+TEST(P256_NistzTest, ABI) {
BN_ULONG a[P256_LIMBS], b[P256_LIMBS], c[P256_LIMBS];
OPENSSL_memset(a, 0x01, sizeof(a));
// These functions are all constant-time, so it is only necessary to
diff --git a/crypto/fipsmodule/ec/p256-x86_64_tests.txt b/crypto/fipsmodule/ec/p256-nistz_tests.txt
similarity index 100%
rename from crypto/fipsmodule/ec/p256-x86_64_tests.txt
rename to crypto/fipsmodule/ec/p256-nistz_tests.txt
diff --git a/sources.cmake b/sources.cmake
index 3d3465f..434b3c2 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -49,7 +49,7 @@
crypto/fipsmodule/bn/bn_tests.txt
crypto/fipsmodule/bn/miller_rabin_tests.txt
crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt
- crypto/fipsmodule/ec/p256-x86_64_tests.txt
+ crypto/fipsmodule/ec/p256-nistz_tests.txt
crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt
crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
crypto/fipsmodule/modes/gcm_tests.txt
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index 5d4b1f4..55c8671 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -509,7 +509,7 @@
// This is a branch. Either the target needs to be written to a local
// version of the symbol to ensure that no relocations are emitted, or
// it needs to jump to a redirector function.
- symbol, _, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
+ symbol, offset, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
changed = didChange
if _, knownSymbol := d.symbols[symbol]; knownSymbol {
@@ -520,6 +520,13 @@
d.redirectors[symbol] = redirector
symbol = redirector
changed = true
+ } else if didChange && symbolIsLocal && len(offset) > 0 {
+			// didChange is set when the inputFile index is not 0. Index 0 is the
+			// first file copied to the output, which is the generated assembly of bcm.c.
+			// In subsequently copied assembly files, local symbols are renamed by
+			// appending (BCM_ + index), with index incremented per file, so that they
+			// don't collide. If there is an offset after the symbol, append the offset.
+ symbol = symbol + offset
}
args = append(args, symbol)
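As a hypothetical illustration: if parseMemRef returns a local branch target that has already been renamed to, say, `.Lbeeu_loop_BCM_1` together with the offset `+4`, the emitted operand is now `.Lbeeu_loop_BCM_1+4`; previously the offset was discarded and only the bare symbol was appended to `args`.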
diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg
index c253a48..8267065 100644
--- a/util/fipstools/delocate/delocate.peg
+++ b/util/fipstools/delocate/delocate.peg
@@ -94,7 +94,7 @@
BaseIndexScale)
SymbolRef <- (Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?
Low12BitsSymbolRef <- ":lo12:" (LocalSymbol / SymbolName) Offset?
-ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
+ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / (('+' [0-9]+)*))? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
ARMGOTLow12 <- ":got_lo12:" SymbolName
ARMPostincrement <- '!'
BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
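With the extended rule, ARM memory operands whose immediate offset is written as an arithmetic expression now parse; for example `[sp,#16*5]`, `[sp,#32*(12-4)]`, and `[x1,#8+8]` are all matched by ARMBaseIndexScale, whereas the previous grammar only handled a bare `#Offset` or `#Offset*N`.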
diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go
index ea7c195..6f5c654 100644
--- a/util/fipstools/delocate/delocate.peg.go
+++ b/util/fipstools/delocate/delocate.peg.go
@@ -1,10 +1,14 @@
package main
+// Code generated by ./peg/peg delocate.peg DO NOT EDIT.
+
import (
"fmt"
- "math"
+ "io"
+ "os"
"sort"
"strconv"
+ "strings"
)
const endSymbol rune = 1114112
@@ -142,19 +146,19 @@
up, next *node32
}
-func (node *node32) print(pretty bool, buffer string) {
+func (node *node32) print(w io.Writer, pretty bool, buffer string) {
var print func(node *node32, depth int)
print = func(node *node32, depth int) {
for node != nil {
for c := 0; c < depth; c++ {
- fmt.Printf(" ")
+ fmt.Fprintf(w, " ")
}
rule := rul3s[node.pegRule]
quote := strconv.Quote(string(([]rune(buffer)[node.begin:node.end])))
if !pretty {
- fmt.Printf("%v %v\n", rule, quote)
+ fmt.Fprintf(w, "%v %v\n", rule, quote)
} else {
- fmt.Printf("\x1B[34m%v\x1B[m %v\n", rule, quote)
+ fmt.Fprintf(w, "\x1B[36m%v\x1B[m %v\n", rule, quote)
}
if node.up != nil {
print(node.up, depth+1)
@@ -165,12 +169,12 @@
print(node, 0)
}
-func (node *node32) Print(buffer string) {
- node.print(false, buffer)
+func (node *node32) Print(w io.Writer, buffer string) {
+ node.print(w, false, buffer)
}
-func (node *node32) PrettyPrint(buffer string) {
- node.print(true, buffer)
+func (node *node32) PrettyPrint(w io.Writer, buffer string) {
+ node.print(w, true, buffer)
}
type tokens32 struct {
@@ -213,24 +217,24 @@
}
func (t *tokens32) PrintSyntaxTree(buffer string) {
- t.AST().Print(buffer)
+ t.AST().Print(os.Stdout, buffer)
+}
+
+func (t *tokens32) WriteSyntaxTree(w io.Writer, buffer string) {
+ t.AST().Print(w, buffer)
}
func (t *tokens32) PrettyPrintSyntaxTree(buffer string) {
- t.AST().PrettyPrint(buffer)
+ t.AST().PrettyPrint(os.Stdout, buffer)
}
func (t *tokens32) Add(rule pegRule, begin, end, index uint32) {
- if tree := t.tree; int(index) >= len(tree) {
- expanded := make([]token32, 2*len(tree))
- copy(expanded, tree)
- t.tree = expanded
+ tree, i := t.tree, int(index)
+ if i >= len(tree) {
+ t.tree = append(tree, token32{pegRule: rule, begin: begin, end: end})
+ return
}
- t.tree[index] = token32{
- pegRule: rule,
- begin: begin,
- end: end,
- }
+ tree[i] = token32{pegRule: rule, begin: begin, end: end}
}
func (t *tokens32) Tokens() []token32 {
@@ -292,7 +296,7 @@
}
func (e *parseError) Error() string {
- tokens, error := []token32{e.max}, "\n"
+ tokens, err := []token32{e.max}, "\n"
positions, p := make([]int, 2*len(tokens)), 0
for _, token := range tokens {
positions[p], p = int(token.begin), p+1
@@ -305,14 +309,14 @@
}
for _, token := range tokens {
begin, end := int(token.begin), int(token.end)
- error += fmt.Sprintf(format,
+ err += fmt.Sprintf(format,
rul3s[token.pegRule],
translations[begin].line, translations[begin].symbol,
translations[end].line, translations[end].symbol,
strconv.Quote(string(e.p.buffer[begin:end])))
}
- return error
+ return err
}
func (p *Asm) PrintSyntaxTree() {
@@ -323,12 +327,41 @@
}
}
-func (p *Asm) Init() {
+func (p *Asm) WriteSyntaxTree(w io.Writer) {
+ p.tokens32.WriteSyntaxTree(w, p.Buffer)
+}
+
+func (p *Asm) SprintSyntaxTree() string {
+ var bldr strings.Builder
+ p.WriteSyntaxTree(&bldr)
+ return bldr.String()
+}
+
+func Pretty(pretty bool) func(*Asm) error {
+ return func(p *Asm) error {
+ p.Pretty = pretty
+ return nil
+ }
+}
+
+func Size(size int) func(*Asm) error {
+ return func(p *Asm) error {
+ p.tokens32 = tokens32{tree: make([]token32, 0, size)}
+ return nil
+ }
+}
+func (p *Asm) Init(options ...func(*Asm) error) error {
var (
max token32
position, tokenIndex uint32
buffer []rune
)
+ for _, option := range options {
+ err := option(p)
+ if err != nil {
+ return err
+ }
+ }
p.reset = func() {
max = token32{}
position, tokenIndex = 0, 0
@@ -342,7 +375,7 @@
p.reset()
_rules := p.rules
- tree := tokens32{tree: make([]token32, math.MaxInt16)}
+ tree := p.tokens32
p.parse = func(rule ...int) error {
r := 1
if len(rule) > 0 {
@@ -5708,7 +5741,7 @@
position, tokenIndex = position727, tokenIndex727
return false
},
- /* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
+ /* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / ('+' [0-9]+)*)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
func() bool {
position737, tokenIndex737 := position, tokenIndex
{
@@ -5747,27 +5780,108 @@
}
{
position745, tokenIndex745 := position, tokenIndex
- if buffer[position] != rune('*') {
- goto l745
- }
- position++
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l745
- }
- position++
- l747:
{
- position748, tokenIndex748 := position, tokenIndex
+ position747, tokenIndex747 := position, tokenIndex
+ if buffer[position] != rune('*') {
+ goto l748
+ }
+ position++
if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l748
}
position++
+ l749:
+ {
+ position750, tokenIndex750 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l750
+ }
+ position++
+ goto l749
+ l750:
+ position, tokenIndex = position750, tokenIndex750
+ }
goto l747
l748:
- position, tokenIndex = position748, tokenIndex748
+ position, tokenIndex = position747, tokenIndex747
+ if buffer[position] != rune('*') {
+ goto l751
+ }
+ position++
+ if buffer[position] != rune('(') {
+ goto l751
+ }
+ position++
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l751
+ }
+ position++
+ l752:
+ {
+ position753, tokenIndex753 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l753
+ }
+ position++
+ goto l752
+ l753:
+ position, tokenIndex = position753, tokenIndex753
+ }
+ if !_rules[ruleOperator]() {
+ goto l751
+ }
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l751
+ }
+ position++
+ l754:
+ {
+ position755, tokenIndex755 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l755
+ }
+ position++
+ goto l754
+ l755:
+ position, tokenIndex = position755, tokenIndex755
+ }
+ if buffer[position] != rune(')') {
+ goto l751
+ }
+ position++
+ goto l747
+ l751:
+ position, tokenIndex = position747, tokenIndex747
+ l756:
+ {
+ position757, tokenIndex757 := position, tokenIndex
+ if buffer[position] != rune('+') {
+ goto l757
+ }
+ position++
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l757
+ }
+ position++
+ l758:
+ {
+ position759, tokenIndex759 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l759
+ }
+ position++
+ goto l758
+ l759:
+ position, tokenIndex = position759, tokenIndex759
+ }
+ goto l756
+ l757:
+ position, tokenIndex = position757, tokenIndex757
+ }
}
+ l747:
goto l746
- l745:
+
position, tokenIndex = position745, tokenIndex745
}
l746:
@@ -5775,16 +5889,16 @@
l744:
position, tokenIndex = position743, tokenIndex743
if !_rules[ruleARMGOTLow12]() {
- goto l749
+ goto l760
}
goto l743
- l749:
+ l760:
position, tokenIndex = position743, tokenIndex743
if !_rules[ruleLow12BitsSymbolRef]() {
- goto l750
+ goto l761
}
goto l743
- l750:
+ l761:
position, tokenIndex = position743, tokenIndex743
if !_rules[ruleARMRegister]() {
goto l739
@@ -5792,29 +5906,29 @@
}
l743:
{
- position751, tokenIndex751 := position, tokenIndex
+ position762, tokenIndex762 := position, tokenIndex
if buffer[position] != rune(',') {
- goto l751
+ goto l762
}
position++
{
- position753, tokenIndex753 := position, tokenIndex
+ position764, tokenIndex764 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l753
+ goto l764
}
- goto l754
- l753:
- position, tokenIndex = position753, tokenIndex753
+ goto l765
+ l764:
+ position, tokenIndex = position764, tokenIndex764
}
- l754:
+ l765:
if !_rules[ruleARMConstantTweak]() {
- goto l751
+ goto l762
}
- goto l752
- l751:
- position, tokenIndex = position751, tokenIndex751
+ goto l763
+ l762:
+ position, tokenIndex = position762, tokenIndex762
}
- l752:
+ l763:
goto l740
l739:
position, tokenIndex = position739, tokenIndex739
@@ -5825,15 +5939,15 @@
}
position++
{
- position755, tokenIndex755 := position, tokenIndex
+ position766, tokenIndex766 := position, tokenIndex
if !_rules[ruleARMPostincrement]() {
- goto l755
+ goto l766
}
- goto l756
- l755:
- position, tokenIndex = position755, tokenIndex755
+ goto l767
+ l766:
+ position, tokenIndex = position766, tokenIndex766
}
- l756:
+ l767:
add(ruleARMBaseIndexScale, position738)
}
return true
@@ -5843,566 +5957,567 @@
},
/* 47 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */
func() bool {
- position757, tokenIndex757 := position, tokenIndex
+ position768, tokenIndex768 := position, tokenIndex
{
- position758 := position
+ position769 := position
if buffer[position] != rune(':') {
- goto l757
+ goto l768
}
position++
{
- position759, tokenIndex759 := position, tokenIndex
+ position770, tokenIndex770 := position, tokenIndex
if buffer[position] != rune('g') {
- goto l760
+ goto l771
}
position++
- goto l759
- l760:
- position, tokenIndex = position759, tokenIndex759
+ goto l770
+ l771:
+ position, tokenIndex = position770, tokenIndex770
if buffer[position] != rune('G') {
- goto l757
- }
- position++
- }
- l759:
- {
- position761, tokenIndex761 := position, tokenIndex
- if buffer[position] != rune('o') {
- goto l762
- }
- position++
- goto l761
- l762:
- position, tokenIndex = position761, tokenIndex761
- if buffer[position] != rune('O') {
- goto l757
- }
- position++
- }
- l761:
- {
- position763, tokenIndex763 := position, tokenIndex
- if buffer[position] != rune('t') {
- goto l764
- }
- position++
- goto l763
- l764:
- position, tokenIndex = position763, tokenIndex763
- if buffer[position] != rune('T') {
- goto l757
- }
- position++
- }
- l763:
- if buffer[position] != rune('_') {
- goto l757
- }
- position++
- {
- position765, tokenIndex765 := position, tokenIndex
- if buffer[position] != rune('l') {
- goto l766
- }
- position++
- goto l765
- l766:
- position, tokenIndex = position765, tokenIndex765
- if buffer[position] != rune('L') {
- goto l757
- }
- position++
- }
- l765:
- {
- position767, tokenIndex767 := position, tokenIndex
- if buffer[position] != rune('o') {
goto l768
}
position++
- goto l767
- l768:
- position, tokenIndex = position767, tokenIndex767
+ }
+ l770:
+ {
+ position772, tokenIndex772 := position, tokenIndex
+ if buffer[position] != rune('o') {
+ goto l773
+ }
+ position++
+ goto l772
+ l773:
+ position, tokenIndex = position772, tokenIndex772
if buffer[position] != rune('O') {
- goto l757
+ goto l768
}
position++
}
- l767:
+ l772:
+ {
+ position774, tokenIndex774 := position, tokenIndex
+ if buffer[position] != rune('t') {
+ goto l775
+ }
+ position++
+ goto l774
+ l775:
+ position, tokenIndex = position774, tokenIndex774
+ if buffer[position] != rune('T') {
+ goto l768
+ }
+ position++
+ }
+ l774:
+ if buffer[position] != rune('_') {
+ goto l768
+ }
+ position++
+ {
+ position776, tokenIndex776 := position, tokenIndex
+ if buffer[position] != rune('l') {
+ goto l777
+ }
+ position++
+ goto l776
+ l777:
+ position, tokenIndex = position776, tokenIndex776
+ if buffer[position] != rune('L') {
+ goto l768
+ }
+ position++
+ }
+ l776:
+ {
+ position778, tokenIndex778 := position, tokenIndex
+ if buffer[position] != rune('o') {
+ goto l779
+ }
+ position++
+ goto l778
+ l779:
+ position, tokenIndex = position778, tokenIndex778
+ if buffer[position] != rune('O') {
+ goto l768
+ }
+ position++
+ }
+ l778:
if buffer[position] != rune('1') {
- goto l757
+ goto l768
}
position++
if buffer[position] != rune('2') {
- goto l757
+ goto l768
}
position++
if buffer[position] != rune(':') {
- goto l757
+ goto l768
}
position++
if !_rules[ruleSymbolName]() {
- goto l757
+ goto l768
}
- add(ruleARMGOTLow12, position758)
+ add(ruleARMGOTLow12, position769)
}
return true
- l757:
- position, tokenIndex = position757, tokenIndex757
+ l768:
+ position, tokenIndex = position768, tokenIndex768
return false
},
/* 48 ARMPostincrement <- <'!'> */
func() bool {
- position769, tokenIndex769 := position, tokenIndex
+ position780, tokenIndex780 := position, tokenIndex
{
- position770 := position
+ position781 := position
if buffer[position] != rune('!') {
- goto l769
+ goto l780
}
position++
- add(ruleARMPostincrement, position770)
+ add(ruleARMPostincrement, position781)
}
return true
- l769:
- position, tokenIndex = position769, tokenIndex769
+ l780:
+ position, tokenIndex = position780, tokenIndex780
return false
},
/* 49 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? ')')> */
func() bool {
- position771, tokenIndex771 := position, tokenIndex
+ position782, tokenIndex782 := position, tokenIndex
{
- position772 := position
+ position783 := position
if buffer[position] != rune('(') {
- goto l771
+ goto l782
}
position++
{
- position773, tokenIndex773 := position, tokenIndex
+ position784, tokenIndex784 := position, tokenIndex
if !_rules[ruleRegisterOrConstant]() {
- goto l773
+ goto l784
}
- goto l774
- l773:
- position, tokenIndex = position773, tokenIndex773
+ goto l785
+ l784:
+ position, tokenIndex = position784, tokenIndex784
}
- l774:
+ l785:
{
- position775, tokenIndex775 := position, tokenIndex
+ position786, tokenIndex786 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l775
+ goto l786
}
- goto l776
- l775:
- position, tokenIndex = position775, tokenIndex775
+ goto l787
+ l786:
+ position, tokenIndex = position786, tokenIndex786
}
- l776:
+ l787:
{
- position777, tokenIndex777 := position, tokenIndex
+ position788, tokenIndex788 := position, tokenIndex
if buffer[position] != rune(',') {
- goto l777
+ goto l788
}
position++
{
- position779, tokenIndex779 := position, tokenIndex
+ position790, tokenIndex790 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l779
+ goto l790
}
- goto l780
- l779:
- position, tokenIndex = position779, tokenIndex779
+ goto l791
+ l790:
+ position, tokenIndex = position790, tokenIndex790
}
- l780:
+ l791:
if !_rules[ruleRegisterOrConstant]() {
- goto l777
+ goto l788
}
{
- position781, tokenIndex781 := position, tokenIndex
+ position792, tokenIndex792 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l781
+ goto l792
}
- goto l782
- l781:
- position, tokenIndex = position781, tokenIndex781
+ goto l793
+ l792:
+ position, tokenIndex = position792, tokenIndex792
}
- l782:
+ l793:
{
- position783, tokenIndex783 := position, tokenIndex
+ position794, tokenIndex794 := position, tokenIndex
if buffer[position] != rune(',') {
- goto l783
+ goto l794
}
position++
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l783
+ goto l794
}
position++
- l785:
+ l796:
{
- position786, tokenIndex786 := position, tokenIndex
+ position797, tokenIndex797 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l786
+ goto l797
}
position++
- goto l785
- l786:
- position, tokenIndex = position786, tokenIndex786
+ goto l796
+ l797:
+ position, tokenIndex = position797, tokenIndex797
}
- goto l784
- l783:
- position, tokenIndex = position783, tokenIndex783
+ goto l795
+ l794:
+ position, tokenIndex = position794, tokenIndex794
}
- l784:
- goto l778
- l777:
- position, tokenIndex = position777, tokenIndex777
+ l795:
+ goto l789
+ l788:
+ position, tokenIndex = position788, tokenIndex788
}
- l778:
+ l789:
if buffer[position] != rune(')') {
- goto l771
+ goto l782
}
position++
- add(ruleBaseIndexScale, position772)
+ add(ruleBaseIndexScale, position783)
}
return true
- l771:
- position, tokenIndex = position771, tokenIndex771
+ l782:
+ position, tokenIndex = position782, tokenIndex782
return false
},
/* 50 Operator <- <('+' / '-')> */
func() bool {
- position787, tokenIndex787 := position, tokenIndex
+ position798, tokenIndex798 := position, tokenIndex
{
- position788 := position
+ position799 := position
{
- position789, tokenIndex789 := position, tokenIndex
+ position800, tokenIndex800 := position, tokenIndex
if buffer[position] != rune('+') {
- goto l790
+ goto l801
}
position++
- goto l789
- l790:
- position, tokenIndex = position789, tokenIndex789
+ goto l800
+ l801:
+ position, tokenIndex = position800, tokenIndex800
if buffer[position] != rune('-') {
- goto l787
+ goto l798
}
position++
}
- l789:
- add(ruleOperator, position788)
+ l800:
+ add(ruleOperator, position799)
}
return true
- l787:
- position, tokenIndex = position787, tokenIndex787
+ l798:
+ position, tokenIndex = position798, tokenIndex798
return false
},
/* 51 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / [0-9]+))> */
func() bool {
- position791, tokenIndex791 := position, tokenIndex
+ position802, tokenIndex802 := position, tokenIndex
{
- position792 := position
+ position803 := position
{
- position793, tokenIndex793 := position, tokenIndex
+ position804, tokenIndex804 := position, tokenIndex
if buffer[position] != rune('+') {
- goto l793
+ goto l804
}
position++
- goto l794
- l793:
- position, tokenIndex = position793, tokenIndex793
+ goto l805
+ l804:
+ position, tokenIndex = position804, tokenIndex804
}
- l794:
+ l805:
{
- position795, tokenIndex795 := position, tokenIndex
+ position806, tokenIndex806 := position, tokenIndex
if buffer[position] != rune('-') {
- goto l795
+ goto l806
}
position++
- goto l796
- l795:
- position, tokenIndex = position795, tokenIndex795
+ goto l807
+ l806:
+ position, tokenIndex = position806, tokenIndex806
}
- l796:
+ l807:
{
- position797, tokenIndex797 := position, tokenIndex
+ position808, tokenIndex808 := position, tokenIndex
if buffer[position] != rune('0') {
- goto l798
+ goto l809
}
position++
{
- position799, tokenIndex799 := position, tokenIndex
+ position810, tokenIndex810 := position, tokenIndex
if buffer[position] != rune('b') {
- goto l800
+ goto l811
}
position++
- goto l799
- l800:
- position, tokenIndex = position799, tokenIndex799
+ goto l810
+ l811:
+ position, tokenIndex = position810, tokenIndex810
if buffer[position] != rune('B') {
- goto l798
- }
- position++
- }
- l799:
- {
- position803, tokenIndex803 := position, tokenIndex
- if buffer[position] != rune('0') {
- goto l804
- }
- position++
- goto l803
- l804:
- position, tokenIndex = position803, tokenIndex803
- if buffer[position] != rune('1') {
- goto l798
- }
- position++
- }
- l803:
- l801:
- {
- position802, tokenIndex802 := position, tokenIndex
- {
- position805, tokenIndex805 := position, tokenIndex
- if buffer[position] != rune('0') {
- goto l806
- }
- position++
- goto l805
- l806:
- position, tokenIndex = position805, tokenIndex805
- if buffer[position] != rune('1') {
- goto l802
- }
- position++
- }
- l805:
- goto l801
- l802:
- position, tokenIndex = position802, tokenIndex802
- }
- goto l797
- l798:
- position, tokenIndex = position797, tokenIndex797
- if buffer[position] != rune('0') {
- goto l807
- }
- position++
- {
- position808, tokenIndex808 := position, tokenIndex
- if buffer[position] != rune('x') {
goto l809
}
position++
- goto l808
- l809:
- position, tokenIndex = position808, tokenIndex808
- if buffer[position] != rune('X') {
- goto l807
- }
- position++
}
- l808:
- {
- position812, tokenIndex812 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l813
- }
- position++
- goto l812
- l813:
- position, tokenIndex = position812, tokenIndex812
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l814
- }
- position++
- goto l812
- l814:
- position, tokenIndex = position812, tokenIndex812
- {
- position815, tokenIndex815 := position, tokenIndex
- if c := buffer[position]; c < rune('a') || c > rune('f') {
- goto l816
- }
- position++
- goto l815
- l816:
- position, tokenIndex = position815, tokenIndex815
- if c := buffer[position]; c < rune('A') || c > rune('F') {
- goto l807
- }
- position++
- }
- l815:
- }
- l812:
l810:
{
- position811, tokenIndex811 := position, tokenIndex
- {
- position817, tokenIndex817 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l818
- }
- position++
- goto l817
- l818:
- position, tokenIndex = position817, tokenIndex817
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l819
- }
- position++
- goto l817
- l819:
- position, tokenIndex = position817, tokenIndex817
- {
- position820, tokenIndex820 := position, tokenIndex
- if c := buffer[position]; c < rune('a') || c > rune('f') {
- goto l821
- }
- position++
- goto l820
- l821:
- position, tokenIndex = position820, tokenIndex820
- if c := buffer[position]; c < rune('A') || c > rune('F') {
- goto l811
- }
- position++
- }
- l820:
+ position814, tokenIndex814 := position, tokenIndex
+ if buffer[position] != rune('0') {
+ goto l815
}
- l817:
- goto l810
- l811:
- position, tokenIndex = position811, tokenIndex811
+ position++
+ goto l814
+ l815:
+ position, tokenIndex = position814, tokenIndex814
+ if buffer[position] != rune('1') {
+ goto l809
+ }
+ position++
}
- goto l797
- l807:
- position, tokenIndex = position797, tokenIndex797
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l791
+ l814:
+ l812:
+ {
+ position813, tokenIndex813 := position, tokenIndex
+ {
+ position816, tokenIndex816 := position, tokenIndex
+ if buffer[position] != rune('0') {
+ goto l817
+ }
+ position++
+ goto l816
+ l817:
+ position, tokenIndex = position816, tokenIndex816
+ if buffer[position] != rune('1') {
+ goto l813
+ }
+ position++
+ }
+ l816:
+ goto l812
+ l813:
+ position, tokenIndex = position813, tokenIndex813
+ }
+ goto l808
+ l809:
+ position, tokenIndex = position808, tokenIndex808
+ if buffer[position] != rune('0') {
+ goto l818
}
position++
- l822:
+ {
+ position819, tokenIndex819 := position, tokenIndex
+ if buffer[position] != rune('x') {
+ goto l820
+ }
+ position++
+ goto l819
+ l820:
+ position, tokenIndex = position819, tokenIndex819
+ if buffer[position] != rune('X') {
+ goto l818
+ }
+ position++
+ }
+ l819:
{
position823, tokenIndex823 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l823
+ goto l824
}
position++
- goto l822
- l823:
+ goto l823
+ l824:
position, tokenIndex = position823, tokenIndex823
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l825
+ }
+ position++
+ goto l823
+ l825:
+ position, tokenIndex = position823, tokenIndex823
+ {
+ position826, tokenIndex826 := position, tokenIndex
+ if c := buffer[position]; c < rune('a') || c > rune('f') {
+ goto l827
+ }
+ position++
+ goto l826
+ l827:
+ position, tokenIndex = position826, tokenIndex826
+ if c := buffer[position]; c < rune('A') || c > rune('F') {
+ goto l818
+ }
+ position++
+ }
+ l826:
+ }
+ l823:
+ l821:
+ {
+ position822, tokenIndex822 := position, tokenIndex
+ {
+ position828, tokenIndex828 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l829
+ }
+ position++
+ goto l828
+ l829:
+ position, tokenIndex = position828, tokenIndex828
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l830
+ }
+ position++
+ goto l828
+ l830:
+ position, tokenIndex = position828, tokenIndex828
+ {
+ position831, tokenIndex831 := position, tokenIndex
+ if c := buffer[position]; c < rune('a') || c > rune('f') {
+ goto l832
+ }
+ position++
+ goto l831
+ l832:
+ position, tokenIndex = position831, tokenIndex831
+ if c := buffer[position]; c < rune('A') || c > rune('F') {
+ goto l822
+ }
+ position++
+ }
+ l831:
+ }
+ l828:
+ goto l821
+ l822:
+ position, tokenIndex = position822, tokenIndex822
+ }
+ goto l808
+ l818:
+ position, tokenIndex = position808, tokenIndex808
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l802
+ }
+ position++
+ l833:
+ {
+ position834, tokenIndex834 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l834
+ }
+ position++
+ goto l833
+ l834:
+ position, tokenIndex = position834, tokenIndex834
}
}
- l797:
- add(ruleOffset, position792)
+ l808:
+ add(ruleOffset, position803)
}
return true
- l791:
- position, tokenIndex = position791, tokenIndex791
+ l802:
+ position, tokenIndex = position802, tokenIndex802
return false
},
/* 52 Section <- <([a-z] / [A-Z] / '@')+> */
func() bool {
- position824, tokenIndex824 := position, tokenIndex
+ position835, tokenIndex835 := position, tokenIndex
{
- position825 := position
+ position836 := position
{
- position828, tokenIndex828 := position, tokenIndex
+ position839, tokenIndex839 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l829
+ goto l840
}
position++
- goto l828
- l829:
- position, tokenIndex = position828, tokenIndex828
+ goto l839
+ l840:
+ position, tokenIndex = position839, tokenIndex839
if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l830
+ goto l841
}
position++
- goto l828
- l830:
- position, tokenIndex = position828, tokenIndex828
+ goto l839
+ l841:
+ position, tokenIndex = position839, tokenIndex839
if buffer[position] != rune('@') {
- goto l824
+ goto l835
}
position++
}
- l828:
- l826:
+ l839:
+ l837:
{
- position827, tokenIndex827 := position, tokenIndex
+ position838, tokenIndex838 := position, tokenIndex
{
- position831, tokenIndex831 := position, tokenIndex
+ position842, tokenIndex842 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l832
+ goto l843
}
position++
- goto l831
- l832:
- position, tokenIndex = position831, tokenIndex831
+ goto l842
+ l843:
+ position, tokenIndex = position842, tokenIndex842
if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l833
+ goto l844
}
position++
- goto l831
- l833:
- position, tokenIndex = position831, tokenIndex831
+ goto l842
+ l844:
+ position, tokenIndex = position842, tokenIndex842
if buffer[position] != rune('@') {
- goto l827
+ goto l838
}
position++
}
- l831:
- goto l826
- l827:
- position, tokenIndex = position827, tokenIndex827
+ l842:
+ goto l837
+ l838:
+ position, tokenIndex = position838, tokenIndex838
}
- add(ruleSection, position825)
+ add(ruleSection, position836)
}
return true
- l824:
- position, tokenIndex = position824, tokenIndex824
+ l835:
+ position, tokenIndex = position835, tokenIndex835
return false
},
/* 53 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */
func() bool {
- position834, tokenIndex834 := position, tokenIndex
+ position845, tokenIndex845 := position, tokenIndex
{
- position835 := position
+ position846 := position
if buffer[position] != rune('%') {
- goto l834
+ goto l845
}
position++
{
- position836, tokenIndex836 := position, tokenIndex
+ position847, tokenIndex847 := position, tokenIndex
if c := buffer[position]; c < rune('c') || c > rune('g') {
- goto l837
+ goto l848
}
position++
- goto l836
- l837:
- position, tokenIndex = position836, tokenIndex836
+ goto l847
+ l848:
+ position, tokenIndex = position847, tokenIndex847
if buffer[position] != rune('s') {
- goto l834
+ goto l845
}
position++
}
- l836:
+ l847:
if buffer[position] != rune('s') {
- goto l834
+ goto l845
}
position++
if buffer[position] != rune(':') {
- goto l834
+ goto l845
}
position++
- add(ruleSegmentRegister, position835)
+ add(ruleSegmentRegister, position846)
}
return true
- l834:
- position, tokenIndex = position834, tokenIndex834
+ l845:
+ position, tokenIndex = position845, tokenIndex845
return false
},
}
p.rules = _rules
+ return nil
}