P-256 assembly optimisations for Aarch64.

The ARMv8 assembly code in this commit is mostly taken from OpenSSL's `ecp_nistz256-armv8.pl` at https://github.com/openssl/openssl/blob/19e277dd19f2897f6a7b7eb236abe46655e575bf/crypto/ec/asm/ecp_nistz256-armv8.pl (see Note 1), adapting it to the implementation in p256-x86_64.c.

Most of the assembly functions found in `crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl` required to support that code have their analogous functions in the imported OpenSSL ARMv8 Perl assembly implementation with the exception of the functions:
- ecp_nistz256_select_w5
- ecp_nistz256_select_w7
An implementation for these functions was added.

Summary of modifications to the imported code:
* Renamed to `p256-armv8-asm.pl`
* Modified the location of `arm-xlate.pl` and `arm_arch.h`
* Replaced the `scatter-gather subroutines` with `select subroutines`. The `select subroutines` are implemented for ARMv8 similarly to their x86_64 counterparts, `ecp_nistz256_select_w5` and `ecp_nistz256_select_w7`.
* `ecp_nistz256_add` is removed because it was conflicting during the static build with the function of the same name in p256-nistz.c. The latter calls another assembly function, `ecp_nistz256_point_add`.
* `__ecp_nistz256_add` renamed to `__ecp_nistz256_add_to` to avoid the conflict with the function `ecp_nistz256_add` during the static build.
* On l. 924, `add	sp,sp,#256`: the constant 32*(12-4) is computed in advance rather than left for the assembler to evaluate.

Other modifications:
* `beeu_mod_inverse_vartime()` was implemented for AArch64 in `p256_beeu-armv8-asm.pl` similarly to its implementation in `p256_beeu-x86_64-asm.pl`.
* The files containing `p256-x86_64` in their name were renamed to `p256-nistz`, since the functions and tests defined in them now run on ARMv8 as well, if enabled.
* Updated `delocate.go` and `delocate.peg` to handle the offset calculation in the assembly instructions.
* Regenerated `delocate.peg.go`.

Notes:
1- The last commit in the history of the file is in master only; the previous commits are in OpenSSL 3.0.1.
2- This change focuses on AArch64 (64-bit architecture of ARMv8). It does not support ARMv4 or ARMv7.

Testing the performance on an Armv8 platform using -DCMAKE_BUILD_TYPE=Release:
Before:
```
Did 2596 ECDH P-256 operations in 1093956us (2373.0 ops/sec)
Did 6996 ECDSA P-256 signing operations in 1044630us (6697.1 ops/sec)
Did 2970 ECDSA P-256 verify operations in 1084848us (2737.7 ops/sec)
```
After:
```
Did 6699 ECDH P-256 operations in 1091684us (6136.4 ops/sec)
Did 20000 ECDSA P-256 signing operations in 1012944us (19744.4 ops/sec)
Did 7051 ECDSA P-256 verify operations in 1060000us (6651.9 ops/sec)
```

Change-Id: I9fdef12db365967a9264b5b32c07967b55ea48bd
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/51805
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index 312c080..1cb68ca 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -518,7 +518,7 @@
   fipsmodule/aes/aes_test.cc
   fipsmodule/bn/bn_test.cc
   fipsmodule/ec/ec_test.cc
-  fipsmodule/ec/p256-x86_64_test.cc
+  fipsmodule/ec/p256-nistz_test.cc
   fipsmodule/ecdsa/ecdsa_test.cc
   fipsmodule/md5/md5_test.cc
   fipsmodule/modes/gcm_test.cc
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 73f8a02..b99ebc7 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt
@@ -64,6 +64,8 @@
     armv8-mont.${ASM_EXT}
     ghash-neon-armv8.${ASM_EXT}
     ghashv8-armx.${ASM_EXT}
+    p256-armv8-asm.${ASM_EXT}
+    p256_beeu-armv8-asm.${ASM_EXT}
     sha1-armv8.${ASM_EXT}
     sha256-armv8.${ASM_EXT}
     sha512-armv8.${ASM_EXT}
@@ -102,6 +104,8 @@
 perlasm(md5-x86_64.${ASM_EXT} md5/asm/md5-x86_64.pl)
 perlasm(p256-x86_64-asm.${ASM_EXT} ec/asm/p256-x86_64-asm.pl)
 perlasm(p256_beeu-x86_64-asm.${ASM_EXT} ec/asm/p256_beeu-x86_64-asm.pl)
+perlasm(p256-armv8-asm.${ASM_EXT} ec/asm/p256-armv8-asm.pl)
+perlasm(p256_beeu-armv8-asm.${ASM_EXT} ec/asm/p256_beeu-armv8-asm.pl)
 perlasm(rdrand-x86_64.${ASM_EXT} rand/asm/rdrand-x86_64.pl)
 perlasm(rsaz-avx2.${ASM_EXT} bn/asm/rsaz-avx2.pl)
 perlasm(sha1-586.${ASM_EXT} sha/asm/sha1-586.pl)
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
index 6f8f5c0..87618fe 100644
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -71,7 +71,7 @@
 #include "ec/oct.c"
 #include "ec/p224-64.c"
 #include "ec/p256.c"
-#include "ec/p256-x86_64.c"
+#include "ec/p256-nistz.c"
 #include "ec/scalar.c"
 #include "ec/simple.c"
 #include "ec/simple_mul.c"
diff --git a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
new file mode 100644
index 0000000..f2926b8
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
@@ -0,0 +1,1702 @@
+#! /usr/bin/env perl
+# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# ECP_NISTZ256 module for ARMv8.
+#
+# February 2015.
+#
+# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
+# http://eprint.iacr.org/2013/816.
+#
+#			with/without -DECP_NISTZ256_ASM
+# Apple A7		+190-360%
+# Cortex-A53		+190-400%
+# Cortex-A57		+190-350%
+# Denver		+230-400%
+#
+# Ranges denote minimum and maximum improvement coefficients depending
+# on benchmark. Lower coefficients are for ECDSA sign, server-side
+# operation. Keep in mind that +400% means 5x improvement.
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+  Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+{
+my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
+    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
+    map("x$_",(0..17,19,20));
+
+my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont
+
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.align	5
+.Lpoly:
+.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
+.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+.Lone_mont:
+.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+.Lone:
+.quad	1,0,0,0
+.Lord:
+.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad	0xccd1c8aaee00bc4f
+.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+
+// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_to_mont
+.type	ecp_nistz256_to_mont,%function
+.align	6
+ecp_nistz256_to_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldr	$bi,.LRR		// bp[0]
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+	adr	$bp,.LRR		// &bp[0]
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_from_mont
+.type	ecp_nistz256_from_mont,%function
+.align	4
+ecp_nistz256_from_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	mov	$bi,#1			// bp[0]
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+	adr	$bp,.Lone		// &bp[0]
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+
+// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+//					     const BN_ULONG x2[4]);
+.globl	ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,%function
+.align	4
+ecp_nistz256_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldr	$bi,[$bp]		// bp[0]
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	bl	__ecp_nistz256_mul_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,%function
+.align	4
+ecp_nistz256_sqr_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-32]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	bl	__ecp_nistz256_sqr_mont
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x29,x30,[sp],#32
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_div_by_2
+.type	ecp_nistz256_div_by_2,%function
+.align	4
+ecp_nistz256_div_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	$acc0,$acc1,[$ap]
+	ldp	$acc2,$acc3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	bl	__ecp_nistz256_div_by_2
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_2
+.type	ecp_nistz256_mul_by_2,%function
+.align	4
+ecp_nistz256_mul_by_2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	$acc0,$acc1,[$ap]
+	ldp	$acc2,$acc3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_mul_by_3
+.type	ecp_nistz256_mul_by_3,%function
+.align	4
+ecp_nistz256_mul_by_3:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	$acc0,$acc1,[$ap]
+	ldp	$acc2,$acc3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	mov	$a0,$acc0
+	mov	$a1,$acc1
+	mov	$a2,$acc2
+	mov	$a3,$acc3
+
+	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
+
+	mov	$t0,$a0
+	mov	$t1,$a1
+	mov	$t2,$a2
+	mov	$t3,$a3
+
+	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+//				        const BN_ULONG x2[4]);
+.globl	ecp_nistz256_sub
+.type	ecp_nistz256_sub,%function
+.align	4
+ecp_nistz256_sub:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ldp	$acc0,$acc1,[$ap]
+	ldp	$acc2,$acc3,[$ap,#16]
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_sub,.-ecp_nistz256_sub
+
+// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl	ecp_nistz256_neg
+.type	ecp_nistz256_neg,%function
+.align	4
+ecp_nistz256_neg:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	$bp,$ap
+	mov	$acc0,xzr		// a = 0
+	mov	$acc1,xzr
+	mov	$acc2,xzr
+	mov	$acc3,xzr
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	bl	__ecp_nistz256_sub_from
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to $a0-$a3 and b[0] - to $bi
+.type	__ecp_nistz256_mul_mont,%function
+.align	4
+__ecp_nistz256_mul_mont:
+	mul	$acc0,$a0,$bi		// a[0]*b[0]
+	umulh	$t0,$a0,$bi
+
+	mul	$acc1,$a1,$bi		// a[1]*b[0]
+	umulh	$t1,$a1,$bi
+
+	mul	$acc2,$a2,$bi		// a[2]*b[0]
+	umulh	$t2,$a2,$bi
+
+	mul	$acc3,$a3,$bi		// a[3]*b[0]
+	umulh	$t3,$a3,$bi
+	ldr	$bi,[$bp,#8]		// b[1]
+
+	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
+	 lsl	$t0,$acc0,#32
+	adcs	$acc2,$acc2,$t1
+	 lsr	$t1,$acc0,#32
+	adcs	$acc3,$acc3,$t2
+	adc	$acc4,xzr,$t3
+	mov	$acc5,xzr
+___
+for($i=1;$i<4;$i++) {
+        # Reduction iteration is normally performed by accumulating
+        # result of multiplication of modulus by "magic" digit [and
+        # omitting least significant word, which is guaranteed to
+        # be 0], but thanks to special form of modulus and "magic"
+        # digit being equal to least significant word, it can be
+        # performed with additions and subtractions alone. Indeed:
+        #
+        #            ffff0001.00000000.0000ffff.ffffffff
+        # *                                     abcdefgh
+        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
+        #
+        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+        # rewrite above as:
+        #
+        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
+        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
+        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
+        #
+        # or marking redundant operations:
+        #
+        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
+        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
+        # - 0000abcd.efgh0000.--------.--------.--------
+
+$code.=<<___;
+	subs	$t2,$acc0,$t0		// "*0xffff0001"
+	sbc	$t3,$acc0,$t1
+	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
+	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
+	adcs	$acc1,$acc2,$t1
+	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
+	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
+	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
+	adcs	$acc3,$acc4,$t3
+	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
+	adc	$acc4,$acc5,xzr
+
+	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
+	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
+	adcs	$acc1,$acc1,$t1
+	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
+	adcs	$acc2,$acc2,$t2
+	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
+	adcs	$acc3,$acc3,$t3
+	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
+	adc	$acc4,$acc4,xzr
+___
+$code.=<<___	if ($i<3);
+	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
+___
+$code.=<<___;
+	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
+	 lsl	$t0,$acc0,#32
+	adcs	$acc2,$acc2,$t1
+	 lsr	$t1,$acc0,#32
+	adcs	$acc3,$acc3,$t2
+	adcs	$acc4,$acc4,$t3
+	adc	$acc5,xzr,xzr
+___
+}
+$code.=<<___;
+	// last reduction
+	subs	$t2,$acc0,$t0		// "*0xffff0001"
+	sbc	$t3,$acc0,$t1
+	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
+	adcs	$acc1,$acc2,$t1
+	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
+	adcs	$acc3,$acc4,$t3
+	adc	$acc4,$acc5,xzr
+
+	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
+	sbcs	$t1,$acc1,$poly1
+	sbcs	$t2,$acc2,xzr
+	sbcs	$t3,$acc3,$poly3
+	sbcs	xzr,$acc4,xzr		// did it borrow?
+
+	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
+	csel	$acc1,$acc1,$t1,lo
+	csel	$acc2,$acc2,$t2,lo
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,lo
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to $a0-$a3
+.type	__ecp_nistz256_sqr_mont,%function
+.align	4
+__ecp_nistz256_sqr_mont:
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+	//
+	//  "can't overflow" below mark carrying into high part of
+	//  multiplication result, which can't overflow, because it
+	//  can never be all ones.
+
+	mul	$acc1,$a1,$a0		// a[1]*a[0]
+	umulh	$t1,$a1,$a0
+	mul	$acc2,$a2,$a0		// a[2]*a[0]
+	umulh	$t2,$a2,$a0
+	mul	$acc3,$a3,$a0		// a[3]*a[0]
+	umulh	$acc4,$a3,$a0
+
+	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
+	 mul	$t0,$a2,$a1		// a[2]*a[1]
+	 umulh	$t1,$a2,$a1
+	adcs	$acc3,$acc3,$t2
+	 mul	$t2,$a3,$a1		// a[3]*a[1]
+	 umulh	$t3,$a3,$a1
+	adc	$acc4,$acc4,xzr		// can't overflow
+
+	mul	$acc5,$a3,$a2		// a[3]*a[2]
+	umulh	$acc6,$a3,$a2
+
+	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
+	 mul	$acc0,$a0,$a0		// a[0]*a[0]
+	adc	$t2,$t3,xzr		// can't overflow
+
+	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
+	 umulh	$a0,$a0,$a0
+	adcs	$acc4,$acc4,$t1
+	 mul	$t1,$a1,$a1		// a[1]*a[1]
+	adcs	$acc5,$acc5,$t2
+	 umulh	$a1,$a1,$a1
+	adc	$acc6,$acc6,xzr		// can't overflow
+
+	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
+	 mul	$t2,$a2,$a2		// a[2]*a[2]
+	adcs	$acc2,$acc2,$acc2
+	 umulh	$a2,$a2,$a2
+	adcs	$acc3,$acc3,$acc3
+	 mul	$t3,$a3,$a3		// a[3]*a[3]
+	adcs	$acc4,$acc4,$acc4
+	 umulh	$a3,$a3,$a3
+	adcs	$acc5,$acc5,$acc5
+	adcs	$acc6,$acc6,$acc6
+	adc	$acc7,xzr,xzr
+
+	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
+	adcs	$acc2,$acc2,$t1
+	adcs	$acc3,$acc3,$a1
+	adcs	$acc4,$acc4,$t2
+	adcs	$acc5,$acc5,$a2
+	 lsl	$t0,$acc0,#32
+	adcs	$acc6,$acc6,$t3
+	 lsr	$t1,$acc0,#32
+	adc	$acc7,$acc7,$a3
+___
+for($i=0;$i<3;$i++) {			# reductions, see commentary in
+					# multiplication for details
+$code.=<<___;
+	subs	$t2,$acc0,$t0		// "*0xffff0001"
+	sbc	$t3,$acc0,$t1
+	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
+	adcs	$acc1,$acc2,$t1
+	 lsl	$t0,$acc0,#32
+	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
+	 lsr	$t1,$acc0,#32
+	adc	$acc3,$t3,xzr		// can't overflow
+___
+}
+$code.=<<___;
+	subs	$t2,$acc0,$t0		// "*0xffff0001"
+	sbc	$t3,$acc0,$t1
+	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
+	adcs	$acc1,$acc2,$t1
+	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
+	adc	$acc3,$t3,xzr		// can't overflow
+
+	adds	$acc0,$acc0,$acc4	// accumulate upper half
+	adcs	$acc1,$acc1,$acc5
+	adcs	$acc2,$acc2,$acc6
+	adcs	$acc3,$acc3,$acc7
+	adc	$acc4,xzr,xzr
+
+	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
+	sbcs	$t1,$acc1,$poly1
+	sbcs	$t2,$acc2,xzr
+	sbcs	$t3,$acc3,$poly3
+	sbcs	xzr,$acc4,xzr		// did it borrow?
+
+	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
+	csel	$acc1,$acc1,$t1,lo
+	csel	$acc2,$acc2,$t2,lo
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,lo
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
+.type	__ecp_nistz256_add_to,%function
+.align	4
+__ecp_nistz256_add_to:
+	adds	$acc0,$acc0,$t0		// ret = a+b
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	adc	$ap,xzr,xzr		// zap $ap
+
+	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
+	sbcs	$t1,$acc1,$poly1
+	sbcs	$t2,$acc2,xzr
+	sbcs	$t3,$acc3,$poly3
+	sbcs	xzr,$ap,xzr		// did subtraction borrow?
+
+	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
+	csel	$acc1,$acc1,$t1,lo
+	csel	$acc2,$acc2,$t2,lo
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,lo
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to
+
+.type	__ecp_nistz256_sub_from,%function
+.align	4
+__ecp_nistz256_sub_from:
+	ldp	$t0,$t1,[$bp]
+	ldp	$t2,$t3,[$bp,#16]
+	subs	$acc0,$acc0,$t0		// ret = a-b
+	sbcs	$acc1,$acc1,$t1
+	sbcs	$acc2,$acc2,$t2
+	sbcs	$acc3,$acc3,$t3
+	sbc	$ap,xzr,xzr		// zap $ap
+
+	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
+	adcs	$t1,$acc1,$poly1
+	adcs	$t2,$acc2,xzr
+	adc	$t3,$acc3,$poly3
+	cmp	$ap,xzr			// did subtraction borrow?
+
+	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
+	csel	$acc1,$acc1,$t1,eq
+	csel	$acc2,$acc2,$t2,eq
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,eq
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
+
+.type	__ecp_nistz256_sub_morf,%function
+.align	4
+__ecp_nistz256_sub_morf:
+	ldp	$t0,$t1,[$bp]
+	ldp	$t2,$t3,[$bp,#16]
+	subs	$acc0,$t0,$acc0		// ret = b-a
+	sbcs	$acc1,$t1,$acc1
+	sbcs	$acc2,$t2,$acc2
+	sbcs	$acc3,$t3,$acc3
+	sbc	$ap,xzr,xzr		// zap $ap
+
+	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
+	adcs	$t1,$acc1,$poly1
+	adcs	$t2,$acc2,xzr
+	adc	$t3,$acc3,$poly3
+	cmp	$ap,xzr			// did subtraction borrow?
+
+	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
+	csel	$acc1,$acc1,$t1,eq
+	csel	$acc2,$acc2,$t2,eq
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,eq
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
+
+.type	__ecp_nistz256_div_by_2,%function
+.align	4
+__ecp_nistz256_div_by_2:
+	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
+	adcs	$t1,$acc1,$poly1
+	adcs	$t2,$acc2,xzr
+	adcs	$t3,$acc3,$poly3
+	adc	$ap,xzr,xzr		// zap $ap
+	tst	$acc0,#1		// is a even?
+
+	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
+	csel	$acc1,$acc1,$t1,eq
+	csel	$acc2,$acc2,$t2,eq
+	csel	$acc3,$acc3,$t3,eq
+	csel	$ap,xzr,$ap,eq
+
+	lsr	$acc0,$acc0,#1		// ret >>= 1
+	orr	$acc0,$acc0,$acc1,lsl#63
+	lsr	$acc1,$acc1,#1
+	orr	$acc1,$acc1,$acc2,lsl#63
+	lsr	$acc2,$acc2,#1
+	orr	$acc2,$acc2,$acc3,lsl#63
+	lsr	$acc3,$acc3,#1
+	stp	$acc0,$acc1,[$rp]
+	orr	$acc3,$acc3,$ap,lsl#63
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ret
+.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
+___
+########################################################################
+# following subroutines are "literal" implementation of those found in
+# ecp_nistz256.c
+#
+########################################################################
+# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
+#
+{
+my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
+# above map() describes stack layout with 4 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real) = map("x$_",(21,22));
+
+$code.=<<___;
+.globl	ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,%function
+.align	5
+ecp_nistz256_point_double:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	sub	sp,sp,#32*4
+
+.Ldouble_shortcut:
+	ldp	$acc0,$acc1,[$ap,#32]
+	 mov	$rp_real,$rp
+	ldp	$acc2,$acc3,[$ap,#48]
+	 mov	$ap_real,$ap
+	 ldr	$poly1,.Lpoly+8
+	mov	$t0,$acc0
+	 ldr	$poly3,.Lpoly+24
+	mov	$t1,$acc1
+	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	 ldp	$a2,$a3,[$ap_real,#64+16]
+	add	$rp,sp,#$S
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
+
+	add	$rp,sp,#$Zsqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
+
+	ldp	$t0,$t1,[$ap_real]
+	ldp	$t2,$t3,[$ap_real,#16]
+	mov	$a0,$acc0		// put Zsqr aside for p256_sub
+	mov	$a1,$acc1
+	mov	$a2,$acc2
+	mov	$a3,$acc3
+	add	$rp,sp,#$M
+	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
+
+	add	$bp,$ap_real,#0
+	mov	$acc0,$a0		// restore Zsqr
+	mov	$acc1,$a1
+	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
+	mov	$acc2,$a2
+	mov	$acc3,$a3
+	 ldp	$a2,$a3,[sp,#$S+16]
+	add	$rp,sp,#$Zsqr
+	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
+
+	add	$rp,sp,#$S
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
+
+	ldr	$bi,[$ap_real,#32]
+	ldp	$a0,$a1,[$ap_real,#64]
+	ldp	$a2,$a3,[$ap_real,#64+16]
+	add	$bp,$ap_real,#32
+	add	$rp,sp,#$tmp0
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
+
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	 ldp	$a2,$a3,[sp,#$S+16]
+	add	$rp,$rp_real,#64
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
+
+	add	$rp,sp,#$tmp0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
+
+	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
+	 ldp	$a0,$a1,[sp,#$M]
+	 ldp	$a2,$a3,[sp,#$M+16]
+	add	$rp,$rp_real,#32
+	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
+
+	add	$bp,sp,#$Zsqr
+	add	$rp,sp,#$M
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
+
+	mov	$t0,$acc0		// duplicate M
+	mov	$t1,$acc1
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	mov	$a0,$acc0		// put M aside
+	mov	$a1,$acc1
+	mov	$a2,$acc2
+	mov	$a3,$acc3
+	add	$rp,sp,#$M
+	bl	__ecp_nistz256_add_to
+	mov	$t0,$a0			// restore M
+	mov	$t1,$a1
+	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
+	mov	$t2,$a2
+	 ldp	$a0,$a1,[sp,#$S]
+	mov	$t3,$a3
+	 ldp	$a2,$a3,[sp,#$S+16]
+	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
+
+	add	$bp,$ap_real,#0
+	add	$rp,sp,#$S
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
+
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	 ldp	$a2,$a3,[sp,#$M+16]
+	add	$rp,sp,#$tmp0
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
+
+	add	$rp,$rp_real,#0
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
+
+	add	$bp,sp,#$tmp0
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
+
+	add	$bp,sp,#$S
+	add	$rp,sp,#$S
+	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
+
+	ldr	$bi,[sp,#$M]
+	mov	$a0,$acc0		// copy S
+	mov	$a1,$acc1
+	mov	$a2,$acc2
+	mov	$a3,$acc3
+	add	$bp,sp,#$M
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
+
+	add	$bp,$rp_real,#32
+	add	$rp,$rp_real,#32
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
+___
+}
+
+########################################################################
+# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
+#			      const P256_POINT *in2);
+{
+my ($res_x,$res_y,$res_z,
+    $H,$Hsqr,$R,$Rsqr,$Hcub,
+    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
+my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
+# above map() describes stack layout with 12 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
+
+$code.=<<___;
+.globl	ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,%function
+.align	5
+ecp_nistz256_point_add:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#32*12
+
+	ldp	$a0,$a1,[$bp,#64]	// in2_z
+	ldp	$a2,$a3,[$bp,#64+16]
+	 mov	$rp_real,$rp
+	 mov	$ap_real,$ap
+	 mov	$bp_real,$bp
+	 ldr	$poly1,.Lpoly+8
+	 ldr	$poly3,.Lpoly+24
+	orr	$t0,$a0,$a1
+	orr	$t2,$a2,$a3
+	orr	$in2infty,$t0,$t2
+	cmp	$in2infty,#0
+	csetm	$in2infty,ne		// ~in2infty
+	add	$rp,sp,#$Z2sqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
+
+	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
+	ldp	$a2,$a3,[$ap_real,#64+16]
+	orr	$t0,$a0,$a1
+	orr	$t2,$a2,$a3
+	orr	$in1infty,$t0,$t2
+	cmp	$in1infty,#0
+	csetm	$in1infty,ne		// ~in1infty
+	add	$rp,sp,#$Z1sqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	ldr	$bi,[$bp_real,#64]
+	ldp	$a0,$a1,[sp,#$Z2sqr]
+	ldp	$a2,$a3,[sp,#$Z2sqr+16]
+	add	$bp,$bp_real,#64
+	add	$rp,sp,#$S1
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
+
+	ldr	$bi,[$ap_real,#64]
+	ldp	$a0,$a1,[sp,#$Z1sqr]
+	ldp	$a2,$a3,[sp,#$Z1sqr+16]
+	add	$bp,$ap_real,#64
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	$bi,[$ap_real,#32]
+	ldp	$a0,$a1,[sp,#$S1]
+	ldp	$a2,$a3,[sp,#$S1+16]
+	add	$bp,$ap_real,#32
+	add	$rp,sp,#$S1
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
+
+	ldr	$bi,[$bp_real,#32]
+	ldp	$a0,$a1,[sp,#$S2]
+	ldp	$a2,$a3,[sp,#$S2+16]
+	add	$bp,$bp_real,#32
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	$bp,sp,#$S1
+	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
+	 ldp	$a0,$a1,[$ap_real]
+	 ldp	$a2,$a3,[$ap_real,#16]
+	add	$rp,sp,#$R
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
+
+	orr	$acc0,$acc0,$acc1	// see if result is zero
+	orr	$acc2,$acc2,$acc3
+	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)
+
+	add	$bp,sp,#$Z2sqr
+	add	$rp,sp,#$U1
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
+
+	ldr	$bi,[sp,#$Z1sqr]
+	ldp	$a0,$a1,[$bp_real]
+	ldp	$a2,$a3,[$bp_real,#16]
+	add	$bp,sp,#$Z1sqr
+	add	$rp,sp,#$U2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
+
+	add	$bp,sp,#$U1
+	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
+	 ldp	$a2,$a3,[sp,#$R+16]
+	add	$rp,sp,#$H
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
+
+	orr	$acc0,$acc0,$acc1	// see if result is zero
+	orr	$acc2,$acc2,$acc3
+	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)
+
+	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
+	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
+	orr	$acc0,$acc0,$temp1
+	orr	$acc0,$acc0,$temp2
+	orr	$acc0,$acc0,$temp0
+	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+.Ladd_double:
+	mov	$ap,$ap_real
+	mov	$rp,$rp_real
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
+	b	.Ldouble_shortcut
+
+.align	4
+.Ladd_proceed:
+	add	$rp,sp,#$Rsqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	$bi,[$ap_real,#64]
+	ldp	$a0,$a1,[sp,#$H]
+	ldp	$a2,$a3,[sp,#$H+16]
+	add	$bp,$ap_real,#64
+	add	$rp,sp,#$res_z
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldp	$a0,$a1,[sp,#$H]
+	ldp	$a2,$a3,[sp,#$H+16]
+	add	$rp,sp,#$Hsqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldr	$bi,[$bp_real,#64]
+	ldp	$a0,$a1,[sp,#$res_z]
+	ldp	$a2,$a3,[sp,#$res_z+16]
+	add	$bp,$bp_real,#64
+	add	$rp,sp,#$res_z
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
+
+	ldr	$bi,[sp,#$H]
+	ldp	$a0,$a1,[sp,#$Hsqr]
+	ldp	$a2,$a3,[sp,#$Hsqr+16]
+	add	$bp,sp,#$H
+	add	$rp,sp,#$Hcub
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	$bi,[sp,#$Hsqr]
+	ldp	$a0,$a1,[sp,#$U1]
+	ldp	$a2,$a3,[sp,#$U1+16]
+	add	$bp,sp,#$Hsqr
+	add	$rp,sp,#$U2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
+
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	add	$rp,sp,#$Hsqr
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	$bp,sp,#$Rsqr
+	add	$rp,sp,#$res_x
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	$bp,sp,#$Hcub
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	$bp,sp,#$U2
+	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
+	 ldp	$a0,$a1,[sp,#$S1]
+	 ldp	$a2,$a3,[sp,#$S1+16]
+	add	$rp,sp,#$res_y
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	$bp,sp,#$Hcub
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
+
+	ldr	$bi,[sp,#$R]
+	ldp	$a0,$a1,[sp,#$res_y]
+	ldp	$a2,$a3,[sp,#$res_y+16]
+	add	$bp,sp,#$R
+	add	$rp,sp,#$res_y
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	$bp,sp,#$S2
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	$a0,$a1,[sp,#$res_x]		// res
+	ldp	$a2,$a3,[sp,#$res_x+16]
+	ldp	$t0,$t1,[$bp_real]		// in2
+	ldp	$t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) {		# conditional moves
+$code.=<<___;
+	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
+	cmp	$in1infty,#0			// ~\$in1infty, remember?
+	ldp	$acc2,$acc3,[$ap_real,#$i+16]
+	csel	$t0,$a0,$t0,ne
+	csel	$t1,$a1,$t1,ne
+	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
+	csel	$t2,$a2,$t2,ne
+	csel	$t3,$a3,$t3,ne
+	cmp	$in2infty,#0			// ~\$in2infty, remember?
+	ldp	$a2,$a3,[sp,#$res_x+$i+48]
+	csel	$acc0,$t0,$acc0,ne
+	csel	$acc1,$t1,$acc1,ne
+	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
+	csel	$acc2,$t2,$acc2,ne
+	csel	$acc3,$t3,$acc3,ne
+	ldp	$t2,$t3,[$bp_real,#$i+48]
+	stp	$acc0,$acc1,[$rp_real,#$i]
+	stp	$acc2,$acc3,[$rp_real,#$i+16]
+___
+}
+$code.=<<___;
+	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
+	cmp	$in1infty,#0			// ~\$in1infty, remember?
+	ldp	$acc2,$acc3,[$ap_real,#$i+16]
+	csel	$t0,$a0,$t0,ne
+	csel	$t1,$a1,$t1,ne
+	csel	$t2,$a2,$t2,ne
+	csel	$t3,$a3,$t3,ne
+	cmp	$in2infty,#0			// ~\$in2infty, remember?
+	csel	$acc0,$t0,$acc0,ne
+	csel	$acc1,$t1,$acc1,ne
+	csel	$acc2,$t2,$acc2,ne
+	csel	$acc3,$t3,$acc3,ne
+	stp	$acc0,$acc1,[$rp_real,#$i]
+	stp	$acc2,$acc3,[$rp_real,#$i+16]
+
+.Ladd_done:
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
+___
+}
+
+########################################################################
+# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
+#				     const P256_POINT_AFFINE *in2);
+{
+my ($res_x,$res_y,$res_z,
+    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
+my $Z1sqr = $S2;
+# above map() describes stack layout with 10 temporary
+# 256-bit vectors on top.
+my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
+
+$code.=<<___;
+.globl	ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,%function
+.align	5
+ecp_nistz256_point_add_affine:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	sub	sp,sp,#32*10
+
+	mov	$rp_real,$rp
+	mov	$ap_real,$ap
+	mov	$bp_real,$bp
+	ldr	$poly1,.Lpoly+8
+	ldr	$poly3,.Lpoly+24
+
+	ldp	$a0,$a1,[$ap,#64]	// in1_z
+	ldp	$a2,$a3,[$ap,#64+16]
+	orr	$t0,$a0,$a1
+	orr	$t2,$a2,$a3
+	orr	$in1infty,$t0,$t2
+	cmp	$in1infty,#0
+	csetm	$in1infty,ne		// ~in1infty
+
+	ldp	$acc0,$acc1,[$bp]	// in2_x
+	ldp	$acc2,$acc3,[$bp,#16]
+	ldp	$t0,$t1,[$bp,#32]	// in2_y
+	ldp	$t2,$t3,[$bp,#48]
+	orr	$acc0,$acc0,$acc1
+	orr	$acc2,$acc2,$acc3
+	orr	$t0,$t0,$t1
+	orr	$t2,$t2,$t3
+	orr	$acc0,$acc0,$acc2
+	orr	$t0,$t0,$t2
+	orr	$in2infty,$acc0,$t0
+	cmp	$in2infty,#0
+	csetm	$in2infty,ne		// ~in2infty
+
+	add	$rp,sp,#$Z1sqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
+
+	mov	$a0,$acc0
+	mov	$a1,$acc1
+	mov	$a2,$acc2
+	mov	$a3,$acc3
+	ldr	$bi,[$bp_real]
+	add	$bp,$bp_real,#0
+	add	$rp,sp,#$U2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
+
+	add	$bp,$ap_real,#0
+	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
+	 ldp	$a0,$a1,[sp,#$Z1sqr]
+	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
+	add	$rp,sp,#$H
+	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
+
+	add	$bp,$ap_real,#64
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
+
+	ldr	$bi,[$ap_real,#64]
+	ldp	$a0,$a1,[sp,#$H]
+	ldp	$a2,$a3,[sp,#$H+16]
+	add	$bp,$ap_real,#64
+	add	$rp,sp,#$res_z
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
+
+	ldr	$bi,[$bp_real,#32]
+	ldp	$a0,$a1,[sp,#$S2]
+	ldp	$a2,$a3,[sp,#$S2+16]
+	add	$bp,$bp_real,#32
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
+
+	add	$bp,$ap_real,#32
+	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
+	 ldp	$a2,$a3,[sp,#$H+16]
+	add	$rp,sp,#$R
+	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
+
+	add	$rp,sp,#$Hsqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
+
+	ldp	$a0,$a1,[sp,#$R]
+	ldp	$a2,$a3,[sp,#$R+16]
+	add	$rp,sp,#$Rsqr
+	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
+
+	ldr	$bi,[sp,#$H]
+	ldp	$a0,$a1,[sp,#$Hsqr]
+	ldp	$a2,$a3,[sp,#$Hsqr+16]
+	add	$bp,sp,#$H
+	add	$rp,sp,#$Hcub
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
+
+	ldr	$bi,[$ap_real]
+	ldp	$a0,$a1,[sp,#$Hsqr]
+	ldp	$a2,$a3,[sp,#$Hsqr+16]
+	add	$bp,$ap_real,#0
+	add	$rp,sp,#$U2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
+
+	mov	$t0,$acc0
+	mov	$t1,$acc1
+	mov	$t2,$acc2
+	mov	$t3,$acc3
+	add	$rp,sp,#$Hsqr
+	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
+
+	add	$bp,sp,#$Rsqr
+	add	$rp,sp,#$res_x
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
+
+	add	$bp,sp,#$Hcub
+	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
+
+	add	$bp,sp,#$U2
+	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
+	 ldp	$a0,$a1,[sp,#$Hcub]
+	 ldp	$a2,$a3,[sp,#$Hcub+16]
+	add	$rp,sp,#$res_y
+	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
+
+	add	$bp,$ap_real,#32
+	add	$rp,sp,#$S2
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
+
+	ldr	$bi,[sp,#$R]
+	ldp	$a0,$a1,[sp,#$res_y]
+	ldp	$a2,$a3,[sp,#$res_y+16]
+	add	$bp,sp,#$R
+	add	$rp,sp,#$res_y
+	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
+
+	add	$bp,sp,#$S2
+	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
+
+	ldp	$a0,$a1,[sp,#$res_x]		// res
+	ldp	$a2,$a3,[sp,#$res_x+16]
+	ldp	$t0,$t1,[$bp_real]		// in2
+	ldp	$t2,$t3,[$bp_real,#16]
+___
+for($i=0;$i<64;$i+=32) {		# conditional moves
+$code.=<<___;
+	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
+	cmp	$in1infty,#0			// ~\$in1infty, remember?
+	ldp	$acc2,$acc3,[$ap_real,#$i+16]
+	csel	$t0,$a0,$t0,ne
+	csel	$t1,$a1,$t1,ne
+	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
+	csel	$t2,$a2,$t2,ne
+	csel	$t3,$a3,$t3,ne
+	cmp	$in2infty,#0			// ~\$in2infty, remember?
+	ldp	$a2,$a3,[sp,#$res_x+$i+48]
+	csel	$acc0,$t0,$acc0,ne
+	csel	$acc1,$t1,$acc1,ne
+	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
+	csel	$acc2,$t2,$acc2,ne
+	csel	$acc3,$t3,$acc3,ne
+	ldp	$t2,$t3,[$bp_real,#$i+48]
+	stp	$acc0,$acc1,[$rp_real,#$i]
+	stp	$acc2,$acc3,[$rp_real,#$i+16]
+___
+$code.=<<___	if ($i == 0);
+	adr	$bp_real,.Lone_mont-64
+___
+}
+$code.=<<___;
+	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
+	cmp	$in1infty,#0			// ~\$in1infty, remember?
+	ldp	$acc2,$acc3,[$ap_real,#$i+16]
+	csel	$t0,$a0,$t0,ne
+	csel	$t1,$a1,$t1,ne
+	csel	$t2,$a2,$t2,ne
+	csel	$t3,$a3,$t3,ne
+	cmp	$in2infty,#0			// ~\$in2infty, remember?
+	csel	$acc0,$t0,$acc0,ne
+	csel	$acc1,$t1,$acc1,ne
+	csel	$acc2,$t2,$acc2,ne
+	csel	$acc3,$t3,$acc3,ne
+	stp	$acc0,$acc1,[$rp_real,#$i]
+	stp	$acc2,$acc3,[$rp_real,#$i+16]
+
+	add	sp,x29,#0		// destroy frame
+	ldp	x19,x20,[x29,#16]
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x29,x30,[sp],#80
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+___
+}
+if (1) {
+my ($ord0,$ord1) = ($poly1,$poly3);
+my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
+my $acc7 = $bi;
+
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+//                                uint64_t b[4]);
+.globl	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,%function
+.align	4
+ecp_nistz256_ord_mul_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adr	$ordk,.Lord
+	ldr	$bi,[$bp]		// bp[0]
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+
+	ldp	$ord0,$ord1,[$ordk,#0]
+	ldp	$ord2,$ord3,[$ordk,#16]
+	ldr	$ordk,[$ordk,#32]
+
+	mul	$acc0,$a0,$bi		// a[0]*b[0]
+	umulh	$t0,$a0,$bi
+
+	mul	$acc1,$a1,$bi		// a[1]*b[0]
+	umulh	$t1,$a1,$bi
+
+	mul	$acc2,$a2,$bi		// a[2]*b[0]
+	umulh	$t2,$a2,$bi
+
+	mul	$acc3,$a3,$bi		// a[3]*b[0]
+	umulh	$acc4,$a3,$bi
+
+	mul	$t4,$acc0,$ordk
+
+	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
+	adcs	$acc2,$acc2,$t1
+	adcs	$acc3,$acc3,$t2
+	adc	$acc4,$acc4,xzr
+	mov	$acc5,xzr
+___
+for ($i=1;$i<4;$i++) {
+	################################################################
+	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
+	# *                                     abcdefgh
+	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+	#
+	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
+	# rewrite above as:
+	#
+	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
+	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
+	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
+$code.=<<___;
+	ldr	$bi,[$bp,#8*$i]		// b[i]
+
+	lsl	$t0,$t4,#32
+	subs	$acc2,$acc2,$t4
+	lsr	$t1,$t4,#32
+	sbcs	$acc3,$acc3,$t0
+	sbcs	$acc4,$acc4,$t1
+	sbc	$acc5,$acc5,xzr
+
+	subs	xzr,$acc0,#1
+	umulh	$t1,$ord0,$t4
+	mul	$t2,$ord1,$t4
+	umulh	$t3,$ord1,$t4
+
+	adcs	$t2,$t2,$t1
+	 mul	$t0,$a0,$bi
+	adc	$t3,$t3,xzr
+	 mul	$t1,$a1,$bi
+
+	adds	$acc0,$acc1,$t2
+	 mul	$t2,$a2,$bi
+	adcs	$acc1,$acc2,$t3
+	 mul	$t3,$a3,$bi
+	adcs	$acc2,$acc3,$t4
+	adcs	$acc3,$acc4,$t4
+	adc	$acc4,$acc5,xzr
+
+	adds	$acc0,$acc0,$t0		// accumulate low parts
+	umulh	$t0,$a0,$bi
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$a1,$bi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$a2,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$a3,$bi
+	adc	$acc4,$acc4,xzr
+	mul	$t4,$acc0,$ordk
+	adds	$acc1,$acc1,$t0		// accumulate high parts
+	adcs	$acc2,$acc2,$t1
+	adcs	$acc3,$acc3,$t2
+	adcs	$acc4,$acc4,$t3
+	adc	$acc5,xzr,xzr
+___
+}
+$code.=<<___;
+	lsl	$t0,$t4,#32		// last reduction
+	subs	$acc2,$acc2,$t4
+	lsr	$t1,$t4,#32
+	sbcs	$acc3,$acc3,$t0
+	sbcs	$acc4,$acc4,$t1
+	sbc	$acc5,$acc5,xzr
+
+	subs	xzr,$acc0,#1
+	umulh	$t1,$ord0,$t4
+	mul	$t2,$ord1,$t4
+	umulh	$t3,$ord1,$t4
+
+	adcs	$t2,$t2,$t1
+	adc	$t3,$t3,xzr
+
+	adds	$acc0,$acc1,$t2
+	adcs	$acc1,$acc2,$t3
+	adcs	$acc2,$acc3,$t4
+	adcs	$acc3,$acc4,$t4
+	adc	$acc4,$acc5,xzr
+
+	subs	$t0,$acc0,$ord0		// ret -= modulus
+	sbcs	$t1,$acc1,$ord1
+	sbcs	$t2,$acc2,$ord2
+	sbcs	$t3,$acc3,$ord3
+	sbcs	xzr,$acc4,xzr
+
+	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
+	csel	$acc1,$acc1,$t1,lo
+	csel	$acc2,$acc2,$t2,lo
+	stp	$acc0,$acc1,[$rp]
+	csel	$acc3,$acc3,$t3,lo
+	stp	$acc2,$acc3,[$rp,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+//                                int rep);
+.globl	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,%function
+.align	4
+ecp_nistz256_ord_sqr_mont:
+	AARCH64_VALID_CALL_TARGET
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	adr	$ordk,.Lord
+	ldp	$a0,$a1,[$ap]
+	ldp	$a2,$a3,[$ap,#16]
+
+	ldp	$ord0,$ord1,[$ordk,#0]
+	ldp	$ord2,$ord3,[$ordk,#16]
+	ldr	$ordk,[$ordk,#32]
+	b	.Loop_ord_sqr
+
+.align	4
+.Loop_ord_sqr:
+	sub	$bp,$bp,#1
+	////////////////////////////////////////////////////////////////
+	//  |  |  |  |  |  |a1*a0|  |
+	//  |  |  |  |  |a2*a0|  |  |
+	//  |  |a3*a2|a3*a0|  |  |  |
+	//  |  |  |  |a2*a1|  |  |  |
+	//  |  |  |a3*a1|  |  |  |  |
+	// *|  |  |  |  |  |  |  | 2|
+	// +|a3*a3|a2*a2|a1*a1|a0*a0|
+	//  |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
+	//
+	//  "can't overflow" below mark carrying into high part of
+	//  multiplication result, which can't overflow, because it
+	//  can never be all ones.
+
+	mul	$acc1,$a1,$a0		// a[1]*a[0]
+	umulh	$t1,$a1,$a0
+	mul	$acc2,$a2,$a0		// a[2]*a[0]
+	umulh	$t2,$a2,$a0
+	mul	$acc3,$a3,$a0		// a[3]*a[0]
+	umulh	$acc4,$a3,$a0
+
+	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
+	 mul	$t0,$a2,$a1		// a[2]*a[1]
+	 umulh	$t1,$a2,$a1
+	adcs	$acc3,$acc3,$t2
+	 mul	$t2,$a3,$a1		// a[3]*a[1]
+	 umulh	$t3,$a3,$a1
+	adc	$acc4,$acc4,xzr		// can't overflow
+
+	mul	$acc5,$a3,$a2		// a[3]*a[2]
+	umulh	$acc6,$a3,$a2
+
+	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
+	 mul	$acc0,$a0,$a0		// a[0]*a[0]
+	adc	$t2,$t3,xzr		// can't overflow
+
+	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
+	 umulh	$a0,$a0,$a0
+	adcs	$acc4,$acc4,$t1
+	 mul	$t1,$a1,$a1		// a[1]*a[1]
+	adcs	$acc5,$acc5,$t2
+	 umulh	$a1,$a1,$a1
+	adc	$acc6,$acc6,xzr		// can't overflow
+
+	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
+	 mul	$t2,$a2,$a2		// a[2]*a[2]
+	adcs	$acc2,$acc2,$acc2
+	 umulh	$a2,$a2,$a2
+	adcs	$acc3,$acc3,$acc3
+	 mul	$t3,$a3,$a3		// a[3]*a[3]
+	adcs	$acc4,$acc4,$acc4
+	 umulh	$a3,$a3,$a3
+	adcs	$acc5,$acc5,$acc5
+	adcs	$acc6,$acc6,$acc6
+	adc	$acc7,xzr,xzr
+
+	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
+	 mul	$t4,$acc0,$ordk
+	adcs	$acc2,$acc2,$t1
+	adcs	$acc3,$acc3,$a1
+	adcs	$acc4,$acc4,$t2
+	adcs	$acc5,$acc5,$a2
+	adcs	$acc6,$acc6,$t3
+	adc	$acc7,$acc7,$a3
+___
+for($i=0; $i<4; $i++) {			# reductions
+$code.=<<___;
+	subs	xzr,$acc0,#1
+	umulh	$t1,$ord0,$t4
+	mul	$t2,$ord1,$t4
+	umulh	$t3,$ord1,$t4
+
+	adcs	$t2,$t2,$t1
+	adc	$t3,$t3,xzr
+
+	adds	$acc0,$acc1,$t2
+	adcs	$acc1,$acc2,$t3
+	adcs	$acc2,$acc3,$t4
+	adc	$acc3,xzr,$t4		// can't overflow
+___
+$code.=<<___	if ($i<3);
+	mul	$t3,$acc0,$ordk
+___
+$code.=<<___;
+	lsl	$t0,$t4,#32
+	subs	$acc1,$acc1,$t4
+	lsr	$t1,$t4,#32
+	sbcs	$acc2,$acc2,$t0
+	sbc	$acc3,$acc3,$t1		// can't borrow
+___
+	($t3,$t4) = ($t4,$t3);
+}
+$code.=<<___;
+	adds	$acc0,$acc0,$acc4	// accumulate upper half
+	adcs	$acc1,$acc1,$acc5
+	adcs	$acc2,$acc2,$acc6
+	adcs	$acc3,$acc3,$acc7
+	adc	$acc4,xzr,xzr
+
+	subs	$t0,$acc0,$ord0		// ret -= modulus
+	sbcs	$t1,$acc1,$ord1
+	sbcs	$t2,$acc2,$ord2
+	sbcs	$t3,$acc3,$ord3
+	sbcs	xzr,$acc4,xzr
+
+	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
+	csel	$a1,$acc1,$t1,lo
+	csel	$a2,$acc2,$t2,lo
+	csel	$a3,$acc3,$t3,lo
+
+	cbnz	$bp,.Loop_ord_sqr
+
+	stp	$a0,$a1,[$rp]
+	stp	$a2,$a3,[$rp,#16]
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldr	x29,[sp],#64
+	ret
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+}	}
+
+########################################################################
+# select subroutines
+# These select functions are similar to those in p256-x86_64-asm.pl
+# They load all points in the lookup table
+# keeping in the output only the one corresponding to the input index.
+{
+my ($val,$in_t)=map("x$_",(0..1));
+my ($index)=("w2");
+my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
+my ($Mask)=("v3");
+my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
+my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
+$code.=<<___;
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,%function
+.align	4
+ecp_nistz256_select_w5:
+    AARCH64_VALID_CALL_TARGET
+
+    // $Val_in := $val
+    // $Idx_ctr := 0; loop counter and incremented internal index
+    mov     $Val_in, $val
+    mov     $Idx_ctr, #0
+
+    // [$Ra-$Rf] := 0
+    movi    $Ra.16b, #0
+    movi    $Rb.16b, #0
+    movi    $Rc.16b, #0
+    movi    $Rd.16b, #0
+    movi    $Re.16b, #0
+    movi    $Rf.16b, #0
+
+.Lselect_w5_loop:
+    // Loop 16 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+    add $Idx_ctr, $Idx_ctr, #1
+
+    // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
+    //  and advance $in_t to point to the next entry
+    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
+    cmp     $Idx_ctr, $index
+    csetm   $Mask_64, eq
+
+    // continue loading ...
+    ld1     {$T0e.2d, $T0f.2d}, [$in_t],#32
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+    dup     $Mask.2d, $Mask_64
+
+    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+    // i.e., values in output registers will remain the same if $Idx_ctr != $index
+    bit     $Ra.16b, $T0a.16b, $Mask.16b
+    bit     $Rb.16b, $T0b.16b, $Mask.16b
+
+    bit     $Rc.16b, $T0c.16b, $Mask.16b
+    bit     $Rd.16b, $T0d.16b, $Mask.16b
+
+    bit     $Re.16b, $T0e.16b, $Mask.16b
+    bit     $Rf.16b, $T0f.16b, $Mask.16b
+
+    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
+    tbz    $Idx_ctr, #4, .Lselect_w5_loop
+
+    // Write [$Ra-$Rf] to memory at the output pointer
+    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
+    st1     {$Re.2d, $Rf.2d}, [$Val_in]
+
+	ret
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,%function
+.align	4
+ecp_nistz256_select_w7:
+    AARCH64_VALID_CALL_TARGET
+
+    // $Idx_ctr := 0; loop counter and incremented internal index
+    mov     $Idx_ctr, #0
+
+    // [$Ra-$Rd] := 0
+    movi    $Ra.16b, #0
+    movi    $Rb.16b, #0
+    movi    $Rc.16b, #0
+    movi    $Rd.16b, #0
+
+.Lselect_w7_loop:
+    // Loop 64 times.
+
+    // Increment index (loop counter); tested at the end of the loop
+    add $Idx_ctr, $Idx_ctr, #1
+
+    // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
+    //  and advance $in_t to point to the next entry
+    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64
+
+    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
+    cmp     $Idx_ctr, $index
+    csetm   $Mask_64, eq
+
+    // duplicate mask_64 into Mask (all 0s or all 1s)
+    dup     $Mask.2d, $Mask_64
+
+    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
+    // i.e., values in output registers will remain the same if $Idx_ctr != $index
+    bit     $Ra.16b, $T0a.16b, $Mask.16b
+    bit     $Rb.16b, $T0b.16b, $Mask.16b
+
+    bit     $Rc.16b, $T0c.16b, $Mask.16b
+    bit     $Rd.16b, $T0d.16b, $Mask.16b
+
+    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
+    tbz    $Idx_ctr, #6, .Lselect_w7_loop
+
+    // Write [$Ra-$Rd] to memory at the output pointer
+    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]
+
+	ret
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+___
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!";	# enforce flush
diff --git a/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
new file mode 100644
index 0000000..e259aef
--- /dev/null
+++ b/crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl
@@ -0,0 +1,455 @@
+# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+#
+# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd).
+#
+
+# The first two arguments should always be the flavour and output file path.
+if ($#ARGV < 1) { die "Not enough arguments provided.
+  Two arguments are necessary: the flavour and the output file path."; }
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended GCD (Euclidean) Algorithm.
+#  See A. Menezes, P. van Oorschot, and S. Vanstone's Handbook of Applied Cryptography,
+#  Chapter 14, Algorithm 14.61 and Note 14.64
+#  http://cacr.uwaterloo.ca/hac/about/chap14.pdf)
+
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+# Details
+# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse.
+# In other words,
+# ax == 1 (mod y) (where the symbol "==" denotes "congruent")
+#  a == x^{-1} (mod y)
+#
+# It can be shown that throughout all the iterations of the algorithm, the following holds:
+#    u = Ax + By
+#    v = Cx + Dy
+# The values B and D are not of interest in this case, so they need not be computed by the algorithm.
+# This means the following congruences hold through the iterations of the algorithm.
+#    Ax == u (mod y)
+#    Cx == v (mod y)
+
+# Now we will modify the notation to match that of BN_mod_inverse_odd()
+# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based.
+# In those functions:
+#    x, y -> a, n
+#    u, v -> B, A
+#    A, C -> X, Y’, where Y’ = -Y
+# Hence, the following holds throughout the algorithm iterations
+#    Xa == B (mod n)
+#   -Ya == A (mod n)
+#
+# Same algorithm in Python:
+# def beeu(a, n):
+#     X = 1
+#     Y = 0
+#     B = a
+#     A = n
+#     while (B != 0):
+#         while (B % 2) == 0:
+#             B >>= 1
+#             if (X % 2) == 1:
+#                 X = X + n
+#             X >>= 1
+#         while (A % 2) == 0:
+#             A >>= 1
+#             if (Y % 2) == 1:
+#                 Y = Y + n
+#             Y >>= 1
+#         if (B >= A):
+#             B = B - A
+#             X = X + Y
+#         else:
+#             A = A - B
+#             Y = Y + X
+#     if (A != 1):
+#         # error
+#         return 0
+#     else:
+#         while (Y > n):
+#             Y = Y - n
+#         Y = n - Y
+#         return Y
+
+
+# For the internal variables,
+# x0-x2, x30 are used to hold the modulus n. The input parameters passed in
+# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack.
+# x3-x7 are used for parameters, which is not the case in this function, so they are corruptible
+# x8 is corruptible here
+# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address
+#  for that struct).
+# x9-x15 are corruptible registers
+# x19-x28 are callee-saved registers
+
+# X/Y will hold the inverse parameter
+# Assumption: a,n,X,Y < 2^(256)
+# Initially, X := 1, Y := 0
+#            A := n, B := a
+
+# Function parameters (as per the Procedure Call Standard)
+my($out, $a_in, $n_in)=map("x$_",(0..2));
+# Internal variables
+my($n0, $n1, $n2, $n3)=map("x$_",(0..2,30));
+my($x0, $x1, $x2, $x3, $x4)=map("x$_",(3..7));
+my($y0, $y1, $y2, $y3, $y4)=map("x$_",(8..12));
+my($shift)=("x13");
+my($t0, $t1, $t2, $t3)=map("x$_",(14,15,19,20));
+my($a0, $a1, $a2, $a3)=map("x$_",(21..24));
+my($b0, $b1, $b2, $b3)=map("x$_",(25..28));
+
+# if B == 0, jump to end of loop
+sub TEST_B_ZERO {
+  return <<___;
+    orr     $t0, $b0, $b1
+    orr     $t0, $t0, $b2
+
+    // reverse the bit order of $b0. This is needed for clz after this macro
+    rbit     $t1, $b0
+
+    orr     $t0, $t0, $b3
+    cbz     $t0,.Lbeeu_loop_end
+___
+}
+
+# Shift right by 1 bit, adding the modulus first if the variable is odd
+# if least_sig_bit(var0) == 0,
+#     goto shift1_<ctr>
+# else
+#     add n and goto shift1_<ctr>
+# Prerequisite: t0 = 0
+$g_next_label = 0;
+sub SHIFT1 {
+  my ($var0, $var1, $var2, $var3, $var4) = @_;
+  my $label = ".Lshift1_${g_next_label}";
+  $g_next_label++;
+  return <<___;
+    tbz     $var0, #0, $label
+    adds    $var0, $var0, $n0
+    adcs    $var1, $var1, $n1
+    adcs    $var2, $var2, $n2
+    adcs    $var3, $var3, $n3
+    adc     $var4, $var4, $t0
+$label:
+    // var0 := [var1|var0]<64..1>;
+    // i.e. concatenate var1 and var0,
+    //      extract bits <64..1> from the resulting 128-bit value
+    //      and put them in var0
+    extr    $var0, $var1, $var0, #1
+    extr    $var1, $var2, $var1, #1
+    extr    $var2, $var3, $var2, #1
+    extr    $var3, $var4, $var3, #1
+    lsr     $var4, $var4, #1
+___
+}
+
+# compilation by clang 10.0.0 with -O2/-O3 of
+#      a[0] = (a[0] >> count) | (a[1] << (64-count));
+#      a[1] = (a[1] >> count) | (a[2] << (64-count));
+#      a[2] = (a[2] >> count) | (a[3] << (64-count));
+#      a[3] >>= count;
+# Note: EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ
+# except that the second source operand of EXTR is only immediate;
+# that's why it cannot be used here where $shift is a variable
+#
+# In the following,
+# t0 := 0 - shift
+#
+# then var0, for example, will be shifted right as follows:
+# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64))
+# "uint() mod 64" is from the definition of LSL and LSR instructions.
+#
+# What matters here is the order of instructions relative to certain other
+# instructions, i.e.
+# - lsr and lsl must precede orr of the corresponding registers.
+# - lsl must precede the lsr of the same register afterwards.
+# The chosen order of the instructions overall is to try and maximize
+# the pipeline usage.
+sub SHIFT256 {
+  my ($var0, $var1, $var2, $var3) = @_;
+  return <<___;
+    neg $t0, $shift
+    lsr $var0, $var0, $shift
+    lsl $t1, $var1, $t0
+
+    lsr $var1, $var1, $shift
+    lsl $t2, $var2, $t0
+
+    orr $var0, $var0, $t1
+
+    lsr $var2, $var2, $shift
+    lsl $t3, $var3, $t0
+
+    orr $var1, $var1, $t2
+
+    lsr $var3, $var3, $shift
+
+    orr $var2, $var2, $t3
+___
+}
+
+$code.=<<___;
+#include "openssl/arm_arch.h"
+
+.text
+.globl  beeu_mod_inverse_vartime
+.type   beeu_mod_inverse_vartime, %function
+.align  4
+beeu_mod_inverse_vartime:
+    // Reserve enough space for 14 8-byte registers on the stack
+    // in the first stp call for x29, x30.
+    // Then store the remaining callee-saved registers.
+    //
+    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
+    //    ^                                                     ^
+    //    sp  <------------------- 112 bytes ----------------> old sp
+    //   x29 (FP)
+    //
+    AARCH64_SIGN_LINK_REGISTER
+    stp     x29,x30,[sp,#-112]!
+    add     x29,sp,#0
+    stp     x19,x20,[sp,#16]
+    stp     x21,x22,[sp,#32]
+    stp     x23,x24,[sp,#48]
+    stp     x25,x26,[sp,#64]
+    stp     x27,x28,[sp,#80]
+    stp     x0,x2,[sp,#96]
+
+    // B = b3..b0 := a
+    ldp     $b0,$b1,[$a_in]
+    ldp     $b2,$b3,[$a_in,#16]
+
+    // n3..n0 := n
+    // Note: the value of input params are changed in the following.
+    ldp     $n0,$n1,[$n_in]
+    ldp     $n2,$n3,[$n_in,#16]
+
+    // A = a3..a0 := n
+    mov     $a0, $n0
+    mov     $a1, $n1
+    mov     $a2, $n2
+    mov     $a3, $n3
+
+    // X = x4..x0 := 1
+    mov     $x0, #1
+    eor     $x1, $x1, $x1
+    eor     $x2, $x2, $x2
+    eor     $x3, $x3, $x3
+    eor     $x4, $x4, $x4
+
+    // Y = y4..y0 := 0
+    eor     $y0, $y0, $y0
+    eor     $y1, $y1, $y1
+    eor     $y2, $y2, $y2
+    eor     $y3, $y3, $y3
+    eor     $y4, $y4, $y4
+
+.Lbeeu_loop:
+    // if B == 0, jump to .Lbeeu_loop_end
+    ${\TEST_B_ZERO}
+
+    // 0 < B < |n|,
+    // 0 < A <= |n|,
+    // (1)      X*a  ==  B   (mod |n|),
+    // (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    // Now divide B by the maximum possible power of two in the
+    // integers, and divide X by the same value mod |n|.
+    // When we're done, (1) still holds.
+
+    // shift := number of trailing 0s in $b0
+    // (      = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
+    clz     $shift, $t1
+
+    // If there is no shift, goto shift_A_Y
+    cbz     $shift, .Lbeeu_shift_A_Y
+
+    // Shift B right by "$shift" bits
+    ${\SHIFT256($b0, $b1, $b2, $b3)}
+
+    // Shift X right by "$shift" bits, adding n whenever X becomes odd.
+    // $shift--;
+    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+    eor     $t0, $t0, $t0
+.Lbeeu_shift_loop_X:
+    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+    subs    $shift, $shift, #1
+    bne     .Lbeeu_shift_loop_X
+
+    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+    // with the following differences:
+    // - "$shift" is set directly to the number of trailing 0s in B
+    //   (using rbit and clz instructions)
+    // - The loop is only used to call SHIFT1(X)
+    //   and $shift is decreased while executing the X loop.
+    // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+    // Same for A and Y.
+    // Afterwards, (2) still holds.
+    // Reverse the bit order of $a0
+    // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
+    rbit    $t1, $a0
+    clz     $shift, $t1
+
+    // If there is no shift, goto |B-A|, X+Y update
+    cbz     $shift, .Lbeeu_update_B_X_or_A_Y
+
+    // Shift A right by "$shift" bits
+    ${\SHIFT256($a0, $a1, $a2, $a3)}
+
+    // Shift Y right by "$shift" bits, adding n whenever Y becomes odd.
+    // $shift--;
+    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
+    eor     $t0, $t0, $t0
+.Lbeeu_shift_loop_Y:
+    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+    subs    $shift, $shift, #1
+    bne     .Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+    // Try T := B - A; if cs, continue with B >= A (cs: carry set = no borrow)
+    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
+    //       without taking a sign bit if generated. The lack of a carry would
+    //       indicate a negative result. See, for example,
+    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+    subs    $t0, $b0, $a0
+    sbcs    $t1, $b1, $a1
+    sbcs    $t2, $b2, $a2
+    sbcs    $t3, $b3, $a3
+    bcs     .Lbeeu_B_greater_than_A
+
+    // Else A > B =>
+    // A := A - B; Y := Y + X; goto beginning of the loop
+    subs    $a0, $a0, $b0
+    sbcs    $a1, $a1, $b1
+    sbcs    $a2, $a2, $b2
+    sbcs    $a3, $a3, $b3
+
+    adds    $y0, $y0, $x0
+    adcs    $y1, $y1, $x1
+    adcs    $y2, $y2, $x2
+    adcs    $y3, $y3, $x3
+    adc     $y4, $y4, $x4
+    b       .Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+    // Continue with B >= A =>
+    // B := B - A; X := X + Y; goto beginning of the loop
+    mov     $b0, $t0
+    mov     $b1, $t1
+    mov     $b2, $t2
+    mov     $b3, $t3
+
+    adds    $x0, $x0, $y0
+    adcs    $x1, $x1, $y1
+    adcs    $x2, $x2, $y2
+    adcs    $x3, $x3, $y3
+    adc     $x4, $x4, $y4
+    b       .Lbeeu_loop
+
+.Lbeeu_loop_end:
+    // The Euclid's algorithm loop ends when A == gcd(a,n);
+    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+    // Since (-1)*Y*a == A (mod |n|), Y>0
+    // then out = -Y mod n
+
+    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    // Is A-1 == 0?
+    // If not, fail.
+    sub     $t0, $a0, #1
+    orr     $t0, $t0, $a1
+    orr     $t0, $t0, $a2
+    orr     $t0, $t0, $a3
+    cbnz    $t0, .Lbeeu_err
+
+    // If Y>n ==> Y:=Y-n
+.Lbeeu_reduction_loop:
+    // x_i := y_i - n_i (X is no longer needed, use it as temp)
+    // ($t0 = 0 from above)
+    subs    $x0, $y0, $n0
+    sbcs    $x1, $y1, $n1
+    sbcs    $x2, $y2, $n2
+    sbcs    $x3, $y3, $n3
+    sbcs    $x4, $y4, $t0
+
+    // If result is non-negative (i.e., cs = carry set = no borrow),
+    // y_i := x_i; goto reduce again
+    // else
+    // y_i := y_i; continue
+    csel    $y0, $x0, $y0, cs
+    csel    $y1, $x1, $y1, cs
+    csel    $y2, $x2, $y2, cs
+    csel    $y3, $x3, $y3, cs
+    csel    $y4, $x4, $y4, cs
+    bcs     .Lbeeu_reduction_loop
+
+    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+    // out = -Y = n-Y
+    subs    $y0, $n0, $y0
+    sbcs    $y1, $n1, $y1
+    sbcs    $y2, $n2, $y2
+    sbcs    $y3, $n3, $y3
+
+    // Save Y in output (out (x0) was saved on the stack)
+    ldr     x3, [sp,#96]
+    stp     $y0, $y1, [x3]
+    stp     $y2, $y3, [x3,#16]
+    // return 1 (success)
+    mov     x0, #1
+    b       .Lbeeu_finish
+
+.Lbeeu_err:
+    // return 0 (error)
+    eor     x0, x0, x0
+
+.Lbeeu_finish:
+    // Restore callee-saved registers, except x0, x2
+    add     sp,x29,#0
+    ldp     x19,x20,[sp,#16]
+    ldp     x21,x22,[sp,#32]
+    ldp     x23,x24,[sp,#48]
+    ldp     x25,x26,[sp,#64]
+    ldp     x27,x28,[sp,#80]
+    ldp     x29,x30,[sp],#112
+
+    AARCH64_VALIDATE_LINK_REGISTER
+    ret
+.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+___
+
+
+foreach (split("\n",$code)) {
+    s/\`([^\`]*)\`/eval $1/ge;
+
+    print $_,"\n";
+}
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/fipsmodule/ec/ec.c b/crypto/fipsmodule/ec/ec.c
index 93fdcfc..133f561 100644
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -246,7 +246,8 @@
   out->curves[2].param_len = 32;
   out->curves[2].params = kP256Params;
   out->curves[2].method =
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
     !defined(OPENSSL_SMALL)
       EC_GFp_nistz256_method();
 #else
diff --git a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
similarity index 96%
rename from crypto/fipsmodule/ec/make_p256-x86_64-tests.go
rename to crypto/fipsmodule/ec/make_p256-nistz-tests.go
index 958a97a..36194e6 100644
--- a/crypto/fipsmodule/ec/make_p256-x86_64-tests.go
+++ b/crypto/fipsmodule/ec/make_p256-nistz-tests.go
@@ -69,7 +69,7 @@
 
 func isAffineInfinity(x, y *big.Int) bool {
 	// Infinity, in affine coordinates, is represented as (0, 0) by
-	// both Go and p256-x86_64-asm.pl.
+	// Go, p256-x86_64-asm.pl, and p256-armv8-asm.pl.
 	return x.Sign() == 0 && y.Sign() == 0
 }
 
@@ -107,8 +107,8 @@
 		// arbitrary X and Y and include the special case. We also have
 		// not verified that add and double preserve this
 		// property. Thus, generate test vectors with unrelated X and Y,
-		// to test that p256-x86_64-asm.pl correctly handles
-		// unconstrained representations of infinity.
+		// to test that p256-x86_64-asm.pl and p256-armv8-asm.pl correctly
+		// handle unconstrained representations of infinity.
 		x = randNonZeroInt(p)
 		y = randNonZeroInt(p)
 		z = zero
diff --git a/crypto/fipsmodule/ec/make_tables.go b/crypto/fipsmodule/ec/make_tables.go
index 34e8c23..dbcaab0 100644
--- a/crypto/fipsmodule/ec/make_tables.go
+++ b/crypto/fipsmodule/ec/make_tables.go
@@ -23,8 +23,8 @@
 )
 
 func main() {
-	if err := writeP256X86_64Table("p256-x86_64-table.h"); err != nil {
-		fmt.Fprintf(os.Stderr, "Error writing p256-x86_64-table.h: %s\n", err)
+	if err := writeP256NistzTable("p256-nistz-table.h"); err != nil {
+		fmt.Fprintf(os.Stderr, "Error writing p256-nistz-table.h: %s\n", err)
 		os.Exit(1)
 	}
 
@@ -34,7 +34,7 @@
 	}
 }
 
-func writeP256X86_64Table(path string) error {
+func writeP256NistzTable(path string) error {
 	curve := elliptic.P256()
 	tables := make([][][2]*big.Int, 0, 37)
 	for shift := 0; shift < 256; shift += 7 {
@@ -59,7 +59,7 @@
  */
 
 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
diff --git a/crypto/fipsmodule/ec/p256-x86_64-table.h b/crypto/fipsmodule/ec/p256-nistz-table.h
similarity index 99%
rename from crypto/fipsmodule/ec/p256-x86_64-table.h
rename to crypto/fipsmodule/ec/p256-nistz-table.h
index 3af0b01..b81480b 100644
--- a/crypto/fipsmodule/ec/p256-x86_64-table.h
+++ b/crypto/fipsmodule/ec/p256-nistz-table.h
@@ -9,7 +9,7 @@
  */
 
 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-nistz.c
similarity index 98%
rename from crypto/fipsmodule/ec/p256-x86_64.c
rename to crypto/fipsmodule/ec/p256-nistz.c
index 506b7d2..12a4da6 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -30,10 +30,10 @@
 #include "../delocate.h"
 #include "../../internal.h"
 #include "internal.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"
 
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) &&  \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&    \
     !defined(OPENSSL_SMALL)
 
 typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
@@ -45,7 +45,7 @@
 };
 
 // Precomputed tables for the default generator
-#include "p256-x86_64-table.h"
+#include "p256-nistz-table.h"
 
 // Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in
 // util.c for details
@@ -554,10 +554,12 @@
 static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
                                                  EC_SCALAR *out,
                                                  const EC_SCALAR *in) {
+#if defined(OPENSSL_X86_64)
   if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; fallback to generic code.
     return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
   }
+#endif
 
   assert(group->order.width == P256_LIMBS);
   if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) {
@@ -628,5 +630,6 @@
   out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
 }
 
-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#endif /* !defined(OPENSSL_NO_ASM) && \
+          (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&  \
           !defined(OPENSSL_SMALL) */
diff --git a/crypto/fipsmodule/ec/p256-x86_64.h b/crypto/fipsmodule/ec/p256-nistz.h
similarity index 95%
rename from crypto/fipsmodule/ec/p256-x86_64.h
rename to crypto/fipsmodule/ec/p256-nistz.h
index 5deb81a..0d0a6be 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-nistz.h
@@ -30,7 +30,8 @@
 #endif
 
 
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
     !defined(OPENSSL_SMALL)
 
 // P-256 field operations.
@@ -142,8 +143,9 @@
 void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
                                    const P256_POINT_AFFINE *b);
 
-#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
-           !defined(OPENSSL_SMALL) */
+#endif /* !defined(OPENSSL_NO_ASM) && \
+          (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&   \
+          !defined(OPENSSL_SMALL) */
 
 
 #if defined(__cplusplus)
diff --git a/crypto/fipsmodule/ec/p256-x86_64_test.cc b/crypto/fipsmodule/ec/p256-nistz_test.cc
similarity index 97%
rename from crypto/fipsmodule/ec/p256-x86_64_test.cc
rename to crypto/fipsmodule/ec/p256-nistz_test.cc
index f6f070a..73944db 100644
--- a/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/crypto/fipsmodule/ec/p256-nistz_test.cc
@@ -30,15 +30,16 @@
 #include "../../test/abi_test.h"
 #include "../../test/file_test.h"
 #include "../../test/test_util.h"
-#include "p256-x86_64.h"
+#include "p256-nistz.h"
 
 
 // Disable tests if BORINGSSL_SHARED_LIBRARY is defined. These tests need access
 // to internal functions.
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) &&  \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) &&  \
     !defined(OPENSSL_SMALL) && !defined(BORINGSSL_SHARED_LIBRARY)
 
-TEST(P256_X86_64Test, SelectW5) {
+TEST(P256_NistzTest, SelectW5) {
   // Fill a table with some garbage input.
   alignas(64) P256_POINT table[16];
   for (size_t i = 0; i < 16; i++) {
@@ -68,7 +69,7 @@
   CHECK_ABI(ecp_nistz256_select_w5, &val, table, 7);
 }
 
-TEST(P256_X86_64Test, SelectW7) {
+TEST(P256_NistzTest, SelectW7) {
   // Fill a table with some garbage input.
   alignas(64) P256_POINT_AFFINE table[64];
   for (size_t i = 0; i < 64; i++) {
@@ -97,11 +98,13 @@
   CHECK_ABI(ecp_nistz256_select_w7, &val, table, 42);
 }
 
-TEST(P256_X86_64Test, BEEU) {
+TEST(P256_NistzTest, BEEU) {
+#if defined(OPENSSL_X86_64)
   if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; cannot run the BEEU code.
     return;
   }
+#endif
 
   bssl::UniquePtr<EC_GROUP> group(
       EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
@@ -483,8 +486,8 @@
   }
 }
 
-TEST(P256_X86_64Test, TestVectors) {
-  return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
+TEST(P256_NistzTest, TestVectors) {
+  return FileTestGTest("crypto/fipsmodule/ec/p256-nistz_tests.txt",
                        [](FileTest *t) {
     if (t->GetParameter() == "Negate") {
       TestNegate(t);
@@ -503,7 +506,7 @@
 }
 
 // Instrument the functions covered in TestVectors for ABI checking.
-TEST(P256_X86_64Test, ABI) {
+TEST(P256_NistzTest, ABI) {
   BN_ULONG a[P256_LIMBS], b[P256_LIMBS], c[P256_LIMBS];
   OPENSSL_memset(a, 0x01, sizeof(a));
   // These functions are all constant-time, so it is only necessary to
diff --git a/crypto/fipsmodule/ec/p256-x86_64_tests.txt b/crypto/fipsmodule/ec/p256-nistz_tests.txt
similarity index 100%
rename from crypto/fipsmodule/ec/p256-x86_64_tests.txt
rename to crypto/fipsmodule/ec/p256-nistz_tests.txt
diff --git a/sources.cmake b/sources.cmake
index 3d3465f..434b3c2 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -49,7 +49,7 @@
   crypto/fipsmodule/bn/bn_tests.txt
   crypto/fipsmodule/bn/miller_rabin_tests.txt
   crypto/fipsmodule/ec/ec_scalar_base_mult_tests.txt
-  crypto/fipsmodule/ec/p256-x86_64_tests.txt
+  crypto/fipsmodule/ec/p256-nistz_tests.txt
   crypto/fipsmodule/ecdsa/ecdsa_sign_tests.txt
   crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt
   crypto/fipsmodule/modes/gcm_tests.txt
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index 5d4b1f4..55c8671 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -509,7 +509,7 @@
 				// This is a branch. Either the target needs to be written to a local
 				// version of the symbol to ensure that no relocations are emitted, or
 				// it needs to jump to a redirector function.
-				symbol, _, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
+				symbol, offset, _, didChange, symbolIsLocal, _ := d.parseMemRef(arg.up)
 				changed = didChange
 
 				if _, knownSymbol := d.symbols[symbol]; knownSymbol {
@@ -520,6 +520,13 @@
 					d.redirectors[symbol] = redirector
 					symbol = redirector
 					changed = true
+				} else if didChange && symbolIsLocal && len(offset) > 0 {
+					// didChange is set when the inputFile index is not 0. Index 0 is the
+					// first file copied to the output, which is the generated assembly of bcm.c.
+					// In subsequently copied assembly files, local symbols are renamed by appending (BCM_ + index)
+					// so that they don't collide. `index` is incremented per input file.
+					// If there is an offset after the symbol, append the `offset`.
+					symbol = symbol + offset
 				}
 
 				args = append(args, symbol)
diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg
index c253a48..8267065 100644
--- a/util/fipstools/delocate/delocate.peg
+++ b/util/fipstools/delocate/delocate.peg
@@ -94,7 +94,7 @@
               BaseIndexScale)
 SymbolRef <- (Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?
 Low12BitsSymbolRef <- ":lo12:" (LocalSymbol / SymbolName) Offset?
-ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
+ARMBaseIndexScale <- '[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / (('+' [0-9]+)*))? ) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?
 ARMGOTLow12 <- ":got_lo12:" SymbolName
 ARMPostincrement <- '!'
 BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go
index ea7c195..6f5c654 100644
--- a/util/fipstools/delocate/delocate.peg.go
+++ b/util/fipstools/delocate/delocate.peg.go
@@ -1,10 +1,14 @@
 package main
 
+// Code generated by ./peg/peg delocate.peg DO NOT EDIT.
+
 import (
 	"fmt"
-	"math"
+	"io"
+	"os"
 	"sort"
 	"strconv"
+	"strings"
 )
 
 const endSymbol rune = 1114112
@@ -142,19 +146,19 @@
 	up, next *node32
 }
 
-func (node *node32) print(pretty bool, buffer string) {
+func (node *node32) print(w io.Writer, pretty bool, buffer string) {
 	var print func(node *node32, depth int)
 	print = func(node *node32, depth int) {
 		for node != nil {
 			for c := 0; c < depth; c++ {
-				fmt.Printf(" ")
+				fmt.Fprintf(w, " ")
 			}
 			rule := rul3s[node.pegRule]
 			quote := strconv.Quote(string(([]rune(buffer)[node.begin:node.end])))
 			if !pretty {
-				fmt.Printf("%v %v\n", rule, quote)
+				fmt.Fprintf(w, "%v %v\n", rule, quote)
 			} else {
-				fmt.Printf("\x1B[34m%v\x1B[m %v\n", rule, quote)
+				fmt.Fprintf(w, "\x1B[36m%v\x1B[m %v\n", rule, quote)
 			}
 			if node.up != nil {
 				print(node.up, depth+1)
@@ -165,12 +169,12 @@
 	print(node, 0)
 }
 
-func (node *node32) Print(buffer string) {
-	node.print(false, buffer)
+func (node *node32) Print(w io.Writer, buffer string) {
+	node.print(w, false, buffer)
 }
 
-func (node *node32) PrettyPrint(buffer string) {
-	node.print(true, buffer)
+func (node *node32) PrettyPrint(w io.Writer, buffer string) {
+	node.print(w, true, buffer)
 }
 
 type tokens32 struct {
@@ -213,24 +217,24 @@
 }
 
 func (t *tokens32) PrintSyntaxTree(buffer string) {
-	t.AST().Print(buffer)
+	t.AST().Print(os.Stdout, buffer)
+}
+
+func (t *tokens32) WriteSyntaxTree(w io.Writer, buffer string) {
+	t.AST().Print(w, buffer)
 }
 
 func (t *tokens32) PrettyPrintSyntaxTree(buffer string) {
-	t.AST().PrettyPrint(buffer)
+	t.AST().PrettyPrint(os.Stdout, buffer)
 }
 
 func (t *tokens32) Add(rule pegRule, begin, end, index uint32) {
-	if tree := t.tree; int(index) >= len(tree) {
-		expanded := make([]token32, 2*len(tree))
-		copy(expanded, tree)
-		t.tree = expanded
+	tree, i := t.tree, int(index)
+	if i >= len(tree) {
+		t.tree = append(tree, token32{pegRule: rule, begin: begin, end: end})
+		return
 	}
-	t.tree[index] = token32{
-		pegRule: rule,
-		begin:   begin,
-		end:     end,
-	}
+	tree[i] = token32{pegRule: rule, begin: begin, end: end}
 }
 
 func (t *tokens32) Tokens() []token32 {
@@ -292,7 +296,7 @@
 }
 
 func (e *parseError) Error() string {
-	tokens, error := []token32{e.max}, "\n"
+	tokens, err := []token32{e.max}, "\n"
 	positions, p := make([]int, 2*len(tokens)), 0
 	for _, token := range tokens {
 		positions[p], p = int(token.begin), p+1
@@ -305,14 +309,14 @@
 	}
 	for _, token := range tokens {
 		begin, end := int(token.begin), int(token.end)
-		error += fmt.Sprintf(format,
+		err += fmt.Sprintf(format,
 			rul3s[token.pegRule],
 			translations[begin].line, translations[begin].symbol,
 			translations[end].line, translations[end].symbol,
 			strconv.Quote(string(e.p.buffer[begin:end])))
 	}
 
-	return error
+	return err
 }
 
 func (p *Asm) PrintSyntaxTree() {
@@ -323,12 +327,41 @@
 	}
 }
 
-func (p *Asm) Init() {
+func (p *Asm) WriteSyntaxTree(w io.Writer) {
+	p.tokens32.WriteSyntaxTree(w, p.Buffer)
+}
+
+func (p *Asm) SprintSyntaxTree() string {
+	var bldr strings.Builder
+	p.WriteSyntaxTree(&bldr)
+	return bldr.String()
+}
+
+func Pretty(pretty bool) func(*Asm) error {
+	return func(p *Asm) error {
+		p.Pretty = pretty
+		return nil
+	}
+}
+
+func Size(size int) func(*Asm) error {
+	return func(p *Asm) error {
+		p.tokens32 = tokens32{tree: make([]token32, 0, size)}
+		return nil
+	}
+}
+func (p *Asm) Init(options ...func(*Asm) error) error {
 	var (
 		max                  token32
 		position, tokenIndex uint32
 		buffer               []rune
 	)
+	for _, option := range options {
+		err := option(p)
+		if err != nil {
+			return err
+		}
+	}
 	p.reset = func() {
 		max = token32{}
 		position, tokenIndex = 0, 0
@@ -342,7 +375,7 @@
 	p.reset()
 
 	_rules := p.rules
-	tree := tokens32{tree: make([]token32, math.MaxInt16)}
+	tree := p.tokens32
 	p.parse = func(rule ...int) error {
 		r := 1
 		if len(rule) > 0 {
@@ -5708,7 +5741,7 @@
 			position, tokenIndex = position727, tokenIndex727
 			return false
 		},
-		/* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
+		/* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / ('+' [0-9]+)*)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
 		func() bool {
 			position737, tokenIndex737 := position, tokenIndex
 			{
@@ -5747,27 +5780,108 @@
 						}
 						{
 							position745, tokenIndex745 := position, tokenIndex
-							if buffer[position] != rune('*') {
-								goto l745
-							}
-							position++
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l745
-							}
-							position++
-						l747:
 							{
-								position748, tokenIndex748 := position, tokenIndex
+								position747, tokenIndex747 := position, tokenIndex
+								if buffer[position] != rune('*') {
+									goto l748
+								}
+								position++
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
 									goto l748
 								}
 								position++
+							l749:
+								{
+									position750, tokenIndex750 := position, tokenIndex
+									if c := buffer[position]; c < rune('0') || c > rune('9') {
+										goto l750
+									}
+									position++
+									goto l749
+								l750:
+									position, tokenIndex = position750, tokenIndex750
+								}
 								goto l747
 							l748:
-								position, tokenIndex = position748, tokenIndex748
+								position, tokenIndex = position747, tokenIndex747
+								if buffer[position] != rune('*') {
+									goto l751
+								}
+								position++
+								if buffer[position] != rune('(') {
+									goto l751
+								}
+								position++
+								if c := buffer[position]; c < rune('0') || c > rune('9') {
+									goto l751
+								}
+								position++
+							l752:
+								{
+									position753, tokenIndex753 := position, tokenIndex
+									if c := buffer[position]; c < rune('0') || c > rune('9') {
+										goto l753
+									}
+									position++
+									goto l752
+								l753:
+									position, tokenIndex = position753, tokenIndex753
+								}
+								if !_rules[ruleOperator]() {
+									goto l751
+								}
+								if c := buffer[position]; c < rune('0') || c > rune('9') {
+									goto l751
+								}
+								position++
+							l754:
+								{
+									position755, tokenIndex755 := position, tokenIndex
+									if c := buffer[position]; c < rune('0') || c > rune('9') {
+										goto l755
+									}
+									position++
+									goto l754
+								l755:
+									position, tokenIndex = position755, tokenIndex755
+								}
+								if buffer[position] != rune(')') {
+									goto l751
+								}
+								position++
+								goto l747
+							l751:
+								position, tokenIndex = position747, tokenIndex747
+							l756:
+								{
+									position757, tokenIndex757 := position, tokenIndex
+									if buffer[position] != rune('+') {
+										goto l757
+									}
+									position++
+									if c := buffer[position]; c < rune('0') || c > rune('9') {
+										goto l757
+									}
+									position++
+								l758:
+									{
+										position759, tokenIndex759 := position, tokenIndex
+										if c := buffer[position]; c < rune('0') || c > rune('9') {
+											goto l759
+										}
+										position++
+										goto l758
+									l759:
+										position, tokenIndex = position759, tokenIndex759
+									}
+									goto l756
+								l757:
+									position, tokenIndex = position757, tokenIndex757
+								}
 							}
+						l747:
 							goto l746
-						l745:
+
 							position, tokenIndex = position745, tokenIndex745
 						}
 					l746:
@@ -5775,16 +5889,16 @@
 					l744:
 						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleARMGOTLow12]() {
-							goto l749
+							goto l760
 						}
 						goto l743
-					l749:
+					l760:
 						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleLow12BitsSymbolRef]() {
-							goto l750
+							goto l761
 						}
 						goto l743
-					l750:
+					l761:
 						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleARMRegister]() {
 							goto l739
@@ -5792,29 +5906,29 @@
 					}
 				l743:
 					{
-						position751, tokenIndex751 := position, tokenIndex
+						position762, tokenIndex762 := position, tokenIndex
 						if buffer[position] != rune(',') {
-							goto l751
+							goto l762
 						}
 						position++
 						{
-							position753, tokenIndex753 := position, tokenIndex
+							position764, tokenIndex764 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l753
+								goto l764
 							}
-							goto l754
-						l753:
-							position, tokenIndex = position753, tokenIndex753
+							goto l765
+						l764:
+							position, tokenIndex = position764, tokenIndex764
 						}
-					l754:
+					l765:
 						if !_rules[ruleARMConstantTweak]() {
-							goto l751
+							goto l762
 						}
-						goto l752
-					l751:
-						position, tokenIndex = position751, tokenIndex751
+						goto l763
+					l762:
+						position, tokenIndex = position762, tokenIndex762
 					}
-				l752:
+				l763:
 					goto l740
 				l739:
 					position, tokenIndex = position739, tokenIndex739
@@ -5825,15 +5939,15 @@
 				}
 				position++
 				{
-					position755, tokenIndex755 := position, tokenIndex
+					position766, tokenIndex766 := position, tokenIndex
 					if !_rules[ruleARMPostincrement]() {
-						goto l755
+						goto l766
 					}
-					goto l756
-				l755:
-					position, tokenIndex = position755, tokenIndex755
+					goto l767
+				l766:
+					position, tokenIndex = position766, tokenIndex766
 				}
-			l756:
+			l767:
 				add(ruleARMBaseIndexScale, position738)
 			}
 			return true
@@ -5843,566 +5957,567 @@
 		},
 		/* 47 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */
 		func() bool {
-			position757, tokenIndex757 := position, tokenIndex
+			position768, tokenIndex768 := position, tokenIndex
 			{
-				position758 := position
+				position769 := position
 				if buffer[position] != rune(':') {
-					goto l757
+					goto l768
 				}
 				position++
 				{
-					position759, tokenIndex759 := position, tokenIndex
+					position770, tokenIndex770 := position, tokenIndex
 					if buffer[position] != rune('g') {
-						goto l760
+						goto l771
 					}
 					position++
-					goto l759
-				l760:
-					position, tokenIndex = position759, tokenIndex759
+					goto l770
+				l771:
+					position, tokenIndex = position770, tokenIndex770
 					if buffer[position] != rune('G') {
-						goto l757
-					}
-					position++
-				}
-			l759:
-				{
-					position761, tokenIndex761 := position, tokenIndex
-					if buffer[position] != rune('o') {
-						goto l762
-					}
-					position++
-					goto l761
-				l762:
-					position, tokenIndex = position761, tokenIndex761
-					if buffer[position] != rune('O') {
-						goto l757
-					}
-					position++
-				}
-			l761:
-				{
-					position763, tokenIndex763 := position, tokenIndex
-					if buffer[position] != rune('t') {
-						goto l764
-					}
-					position++
-					goto l763
-				l764:
-					position, tokenIndex = position763, tokenIndex763
-					if buffer[position] != rune('T') {
-						goto l757
-					}
-					position++
-				}
-			l763:
-				if buffer[position] != rune('_') {
-					goto l757
-				}
-				position++
-				{
-					position765, tokenIndex765 := position, tokenIndex
-					if buffer[position] != rune('l') {
-						goto l766
-					}
-					position++
-					goto l765
-				l766:
-					position, tokenIndex = position765, tokenIndex765
-					if buffer[position] != rune('L') {
-						goto l757
-					}
-					position++
-				}
-			l765:
-				{
-					position767, tokenIndex767 := position, tokenIndex
-					if buffer[position] != rune('o') {
 						goto l768
 					}
 					position++
-					goto l767
-				l768:
-					position, tokenIndex = position767, tokenIndex767
+				}
+			l770:
+				{
+					position772, tokenIndex772 := position, tokenIndex
+					if buffer[position] != rune('o') {
+						goto l773
+					}
+					position++
+					goto l772
+				l773:
+					position, tokenIndex = position772, tokenIndex772
 					if buffer[position] != rune('O') {
-						goto l757
+						goto l768
 					}
 					position++
 				}
-			l767:
+			l772:
+				{
+					position774, tokenIndex774 := position, tokenIndex
+					if buffer[position] != rune('t') {
+						goto l775
+					}
+					position++
+					goto l774
+				l775:
+					position, tokenIndex = position774, tokenIndex774
+					if buffer[position] != rune('T') {
+						goto l768
+					}
+					position++
+				}
+			l774:
+				if buffer[position] != rune('_') {
+					goto l768
+				}
+				position++
+				{
+					position776, tokenIndex776 := position, tokenIndex
+					if buffer[position] != rune('l') {
+						goto l777
+					}
+					position++
+					goto l776
+				l777:
+					position, tokenIndex = position776, tokenIndex776
+					if buffer[position] != rune('L') {
+						goto l768
+					}
+					position++
+				}
+			l776:
+				{
+					position778, tokenIndex778 := position, tokenIndex
+					if buffer[position] != rune('o') {
+						goto l779
+					}
+					position++
+					goto l778
+				l779:
+					position, tokenIndex = position778, tokenIndex778
+					if buffer[position] != rune('O') {
+						goto l768
+					}
+					position++
+				}
+			l778:
 				if buffer[position] != rune('1') {
-					goto l757
+					goto l768
 				}
 				position++
 				if buffer[position] != rune('2') {
-					goto l757
+					goto l768
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l757
+					goto l768
 				}
 				position++
 				if !_rules[ruleSymbolName]() {
-					goto l757
+					goto l768
 				}
-				add(ruleARMGOTLow12, position758)
+				add(ruleARMGOTLow12, position769)
 			}
 			return true
-		l757:
-			position, tokenIndex = position757, tokenIndex757
+		l768:
+			position, tokenIndex = position768, tokenIndex768
 			return false
 		},
 		/* 48 ARMPostincrement <- <'!'> */
 		func() bool {
-			position769, tokenIndex769 := position, tokenIndex
+			position780, tokenIndex780 := position, tokenIndex
 			{
-				position770 := position
+				position781 := position
 				if buffer[position] != rune('!') {
-					goto l769
+					goto l780
 				}
 				position++
-				add(ruleARMPostincrement, position770)
+				add(ruleARMPostincrement, position781)
 			}
 			return true
-		l769:
-			position, tokenIndex = position769, tokenIndex769
+		l780:
+			position, tokenIndex = position780, tokenIndex780
 			return false
 		},
 		/* 49 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? ')')> */
 		func() bool {
-			position771, tokenIndex771 := position, tokenIndex
+			position782, tokenIndex782 := position, tokenIndex
 			{
-				position772 := position
+				position783 := position
 				if buffer[position] != rune('(') {
-					goto l771
+					goto l782
 				}
 				position++
 				{
-					position773, tokenIndex773 := position, tokenIndex
+					position784, tokenIndex784 := position, tokenIndex
 					if !_rules[ruleRegisterOrConstant]() {
-						goto l773
+						goto l784
 					}
-					goto l774
-				l773:
-					position, tokenIndex = position773, tokenIndex773
+					goto l785
+				l784:
+					position, tokenIndex = position784, tokenIndex784
 				}
-			l774:
+			l785:
 				{
-					position775, tokenIndex775 := position, tokenIndex
+					position786, tokenIndex786 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l775
+						goto l786
 					}
-					goto l776
-				l775:
-					position, tokenIndex = position775, tokenIndex775
+					goto l787
+				l786:
+					position, tokenIndex = position786, tokenIndex786
 				}
-			l776:
+			l787:
 				{
-					position777, tokenIndex777 := position, tokenIndex
+					position788, tokenIndex788 := position, tokenIndex
 					if buffer[position] != rune(',') {
-						goto l777
+						goto l788
 					}
 					position++
 					{
-						position779, tokenIndex779 := position, tokenIndex
+						position790, tokenIndex790 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l779
+							goto l790
 						}
-						goto l780
-					l779:
-						position, tokenIndex = position779, tokenIndex779
+						goto l791
+					l790:
+						position, tokenIndex = position790, tokenIndex790
 					}
-				l780:
+				l791:
 					if !_rules[ruleRegisterOrConstant]() {
-						goto l777
+						goto l788
 					}
 					{
-						position781, tokenIndex781 := position, tokenIndex
+						position792, tokenIndex792 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l781
+							goto l792
 						}
-						goto l782
-					l781:
-						position, tokenIndex = position781, tokenIndex781
+						goto l793
+					l792:
+						position, tokenIndex = position792, tokenIndex792
 					}
-				l782:
+				l793:
 					{
-						position783, tokenIndex783 := position, tokenIndex
+						position794, tokenIndex794 := position, tokenIndex
 						if buffer[position] != rune(',') {
-							goto l783
+							goto l794
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l783
+							goto l794
 						}
 						position++
-					l785:
+					l796:
 						{
-							position786, tokenIndex786 := position, tokenIndex
+							position797, tokenIndex797 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l786
+								goto l797
 							}
 							position++
-							goto l785
-						l786:
-							position, tokenIndex = position786, tokenIndex786
+							goto l796
+						l797:
+							position, tokenIndex = position797, tokenIndex797
 						}
-						goto l784
-					l783:
-						position, tokenIndex = position783, tokenIndex783
+						goto l795
+					l794:
+						position, tokenIndex = position794, tokenIndex794
 					}
-				l784:
-					goto l778
-				l777:
-					position, tokenIndex = position777, tokenIndex777
+				l795:
+					goto l789
+				l788:
+					position, tokenIndex = position788, tokenIndex788
 				}
-			l778:
+			l789:
 				if buffer[position] != rune(')') {
-					goto l771
+					goto l782
 				}
 				position++
-				add(ruleBaseIndexScale, position772)
+				add(ruleBaseIndexScale, position783)
 			}
 			return true
-		l771:
-			position, tokenIndex = position771, tokenIndex771
+		l782:
+			position, tokenIndex = position782, tokenIndex782
 			return false
 		},
 		/* 50 Operator <- <('+' / '-')> */
 		func() bool {
-			position787, tokenIndex787 := position, tokenIndex
+			position798, tokenIndex798 := position, tokenIndex
 			{
-				position788 := position
+				position799 := position
 				{
-					position789, tokenIndex789 := position, tokenIndex
+					position800, tokenIndex800 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l790
+						goto l801
 					}
 					position++
-					goto l789
-				l790:
-					position, tokenIndex = position789, tokenIndex789
+					goto l800
+				l801:
+					position, tokenIndex = position800, tokenIndex800
 					if buffer[position] != rune('-') {
-						goto l787
+						goto l798
 					}
 					position++
 				}
-			l789:
-				add(ruleOperator, position788)
+			l800:
+				add(ruleOperator, position799)
 			}
 			return true
-		l787:
-			position, tokenIndex = position787, tokenIndex787
+		l798:
+			position, tokenIndex = position798, tokenIndex798
 			return false
 		},
 		/* 51 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / [0-9]+))> */
 		func() bool {
-			position791, tokenIndex791 := position, tokenIndex
+			position802, tokenIndex802 := position, tokenIndex
 			{
-				position792 := position
+				position803 := position
 				{
-					position793, tokenIndex793 := position, tokenIndex
+					position804, tokenIndex804 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l793
+						goto l804
 					}
 					position++
-					goto l794
-				l793:
-					position, tokenIndex = position793, tokenIndex793
+					goto l805
+				l804:
+					position, tokenIndex = position804, tokenIndex804
 				}
-			l794:
+			l805:
 				{
-					position795, tokenIndex795 := position, tokenIndex
+					position806, tokenIndex806 := position, tokenIndex
 					if buffer[position] != rune('-') {
-						goto l795
+						goto l806
 					}
 					position++
-					goto l796
-				l795:
-					position, tokenIndex = position795, tokenIndex795
+					goto l807
+				l806:
+					position, tokenIndex = position806, tokenIndex806
 				}
-			l796:
+			l807:
 				{
-					position797, tokenIndex797 := position, tokenIndex
+					position808, tokenIndex808 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l798
+						goto l809
 					}
 					position++
 					{
-						position799, tokenIndex799 := position, tokenIndex
+						position810, tokenIndex810 := position, tokenIndex
 						if buffer[position] != rune('b') {
-							goto l800
+							goto l811
 						}
 						position++
-						goto l799
-					l800:
-						position, tokenIndex = position799, tokenIndex799
+						goto l810
+					l811:
+						position, tokenIndex = position810, tokenIndex810
 						if buffer[position] != rune('B') {
-							goto l798
-						}
-						position++
-					}
-				l799:
-					{
-						position803, tokenIndex803 := position, tokenIndex
-						if buffer[position] != rune('0') {
-							goto l804
-						}
-						position++
-						goto l803
-					l804:
-						position, tokenIndex = position803, tokenIndex803
-						if buffer[position] != rune('1') {
-							goto l798
-						}
-						position++
-					}
-				l803:
-				l801:
-					{
-						position802, tokenIndex802 := position, tokenIndex
-						{
-							position805, tokenIndex805 := position, tokenIndex
-							if buffer[position] != rune('0') {
-								goto l806
-							}
-							position++
-							goto l805
-						l806:
-							position, tokenIndex = position805, tokenIndex805
-							if buffer[position] != rune('1') {
-								goto l802
-							}
-							position++
-						}
-					l805:
-						goto l801
-					l802:
-						position, tokenIndex = position802, tokenIndex802
-					}
-					goto l797
-				l798:
-					position, tokenIndex = position797, tokenIndex797
-					if buffer[position] != rune('0') {
-						goto l807
-					}
-					position++
-					{
-						position808, tokenIndex808 := position, tokenIndex
-						if buffer[position] != rune('x') {
 							goto l809
 						}
 						position++
-						goto l808
-					l809:
-						position, tokenIndex = position808, tokenIndex808
-						if buffer[position] != rune('X') {
-							goto l807
-						}
-						position++
 					}
-				l808:
-					{
-						position812, tokenIndex812 := position, tokenIndex
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l813
-						}
-						position++
-						goto l812
-					l813:
-						position, tokenIndex = position812, tokenIndex812
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l814
-						}
-						position++
-						goto l812
-					l814:
-						position, tokenIndex = position812, tokenIndex812
-						{
-							position815, tokenIndex815 := position, tokenIndex
-							if c := buffer[position]; c < rune('a') || c > rune('f') {
-								goto l816
-							}
-							position++
-							goto l815
-						l816:
-							position, tokenIndex = position815, tokenIndex815
-							if c := buffer[position]; c < rune('A') || c > rune('F') {
-								goto l807
-							}
-							position++
-						}
-					l815:
-					}
-				l812:
 				l810:
 					{
-						position811, tokenIndex811 := position, tokenIndex
-						{
-							position817, tokenIndex817 := position, tokenIndex
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l818
-							}
-							position++
-							goto l817
-						l818:
-							position, tokenIndex = position817, tokenIndex817
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l819
-							}
-							position++
-							goto l817
-						l819:
-							position, tokenIndex = position817, tokenIndex817
-							{
-								position820, tokenIndex820 := position, tokenIndex
-								if c := buffer[position]; c < rune('a') || c > rune('f') {
-									goto l821
-								}
-								position++
-								goto l820
-							l821:
-								position, tokenIndex = position820, tokenIndex820
-								if c := buffer[position]; c < rune('A') || c > rune('F') {
-									goto l811
-								}
-								position++
-							}
-						l820:
+						position814, tokenIndex814 := position, tokenIndex
+						if buffer[position] != rune('0') {
+							goto l815
 						}
-					l817:
-						goto l810
-					l811:
-						position, tokenIndex = position811, tokenIndex811
+						position++
+						goto l814
+					l815:
+						position, tokenIndex = position814, tokenIndex814
+						if buffer[position] != rune('1') {
+							goto l809
+						}
+						position++
 					}
-					goto l797
-				l807:
-					position, tokenIndex = position797, tokenIndex797
-					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l791
+				l814:
+				l812:
+					{
+						position813, tokenIndex813 := position, tokenIndex
+						{
+							position816, tokenIndex816 := position, tokenIndex
+							if buffer[position] != rune('0') {
+								goto l817
+							}
+							position++
+							goto l816
+						l817:
+							position, tokenIndex = position816, tokenIndex816
+							if buffer[position] != rune('1') {
+								goto l813
+							}
+							position++
+						}
+					l816:
+						goto l812
+					l813:
+						position, tokenIndex = position813, tokenIndex813
+					}
+					goto l808
+				l809:
+					position, tokenIndex = position808, tokenIndex808
+					if buffer[position] != rune('0') {
+						goto l818
 					}
 					position++
-				l822:
+					{
+						position819, tokenIndex819 := position, tokenIndex
+						if buffer[position] != rune('x') {
+							goto l820
+						}
+						position++
+						goto l819
+					l820:
+						position, tokenIndex = position819, tokenIndex819
+						if buffer[position] != rune('X') {
+							goto l818
+						}
+						position++
+					}
+				l819:
 					{
 						position823, tokenIndex823 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l823
+							goto l824
 						}
 						position++
-						goto l822
-					l823:
+						goto l823
+					l824:
 						position, tokenIndex = position823, tokenIndex823
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l825
+						}
+						position++
+						goto l823
+					l825:
+						position, tokenIndex = position823, tokenIndex823
+						{
+							position826, tokenIndex826 := position, tokenIndex
+							if c := buffer[position]; c < rune('a') || c > rune('f') {
+								goto l827
+							}
+							position++
+							goto l826
+						l827:
+							position, tokenIndex = position826, tokenIndex826
+							if c := buffer[position]; c < rune('A') || c > rune('F') {
+								goto l818
+							}
+							position++
+						}
+					l826:
+					}
+				l823:
+				l821:
+					{
+						position822, tokenIndex822 := position, tokenIndex
+						{
+							position828, tokenIndex828 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l829
+							}
+							position++
+							goto l828
+						l829:
+							position, tokenIndex = position828, tokenIndex828
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l830
+							}
+							position++
+							goto l828
+						l830:
+							position, tokenIndex = position828, tokenIndex828
+							{
+								position831, tokenIndex831 := position, tokenIndex
+								if c := buffer[position]; c < rune('a') || c > rune('f') {
+									goto l832
+								}
+								position++
+								goto l831
+							l832:
+								position, tokenIndex = position831, tokenIndex831
+								if c := buffer[position]; c < rune('A') || c > rune('F') {
+									goto l822
+								}
+								position++
+							}
+						l831:
+						}
+					l828:
+						goto l821
+					l822:
+						position, tokenIndex = position822, tokenIndex822
+					}
+					goto l808
+				l818:
+					position, tokenIndex = position808, tokenIndex808
+					if c := buffer[position]; c < rune('0') || c > rune('9') {
+						goto l802
+					}
+					position++
+				l833:
+					{
+						position834, tokenIndex834 := position, tokenIndex
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l834
+						}
+						position++
+						goto l833
+					l834:
+						position, tokenIndex = position834, tokenIndex834
 					}
 				}
-			l797:
-				add(ruleOffset, position792)
+			l808:
+				add(ruleOffset, position803)
 			}
 			return true
-		l791:
-			position, tokenIndex = position791, tokenIndex791
+		l802:
+			position, tokenIndex = position802, tokenIndex802
 			return false
 		},
 		/* 52 Section <- <([a-z] / [A-Z] / '@')+> */
 		func() bool {
-			position824, tokenIndex824 := position, tokenIndex
+			position835, tokenIndex835 := position, tokenIndex
 			{
-				position825 := position
+				position836 := position
 				{
-					position828, tokenIndex828 := position, tokenIndex
+					position839, tokenIndex839 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l829
+						goto l840
 					}
 					position++
-					goto l828
-				l829:
-					position, tokenIndex = position828, tokenIndex828
+					goto l839
+				l840:
+					position, tokenIndex = position839, tokenIndex839
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l830
+						goto l841
 					}
 					position++
-					goto l828
-				l830:
-					position, tokenIndex = position828, tokenIndex828
+					goto l839
+				l841:
+					position, tokenIndex = position839, tokenIndex839
 					if buffer[position] != rune('@') {
-						goto l824
+						goto l835
 					}
 					position++
 				}
-			l828:
-			l826:
+			l839:
+			l837:
 				{
-					position827, tokenIndex827 := position, tokenIndex
+					position838, tokenIndex838 := position, tokenIndex
 					{
-						position831, tokenIndex831 := position, tokenIndex
+						position842, tokenIndex842 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l832
+							goto l843
 						}
 						position++
-						goto l831
-					l832:
-						position, tokenIndex = position831, tokenIndex831
+						goto l842
+					l843:
+						position, tokenIndex = position842, tokenIndex842
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l833
+							goto l844
 						}
 						position++
-						goto l831
-					l833:
-						position, tokenIndex = position831, tokenIndex831
+						goto l842
+					l844:
+						position, tokenIndex = position842, tokenIndex842
 						if buffer[position] != rune('@') {
-							goto l827
+							goto l838
 						}
 						position++
 					}
-				l831:
-					goto l826
-				l827:
-					position, tokenIndex = position827, tokenIndex827
+				l842:
+					goto l837
+				l838:
+					position, tokenIndex = position838, tokenIndex838
 				}
-				add(ruleSection, position825)
+				add(ruleSection, position836)
 			}
 			return true
-		l824:
-			position, tokenIndex = position824, tokenIndex824
+		l835:
+			position, tokenIndex = position835, tokenIndex835
 			return false
 		},
 		/* 53 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */
 		func() bool {
-			position834, tokenIndex834 := position, tokenIndex
+			position845, tokenIndex845 := position, tokenIndex
 			{
-				position835 := position
+				position846 := position
 				if buffer[position] != rune('%') {
-					goto l834
+					goto l845
 				}
 				position++
 				{
-					position836, tokenIndex836 := position, tokenIndex
+					position847, tokenIndex847 := position, tokenIndex
 					if c := buffer[position]; c < rune('c') || c > rune('g') {
-						goto l837
+						goto l848
 					}
 					position++
-					goto l836
-				l837:
-					position, tokenIndex = position836, tokenIndex836
+					goto l847
+				l848:
+					position, tokenIndex = position847, tokenIndex847
 					if buffer[position] != rune('s') {
-						goto l834
+						goto l845
 					}
 					position++
 				}
-			l836:
+			l847:
 				if buffer[position] != rune('s') {
-					goto l834
+					goto l845
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l834
+					goto l845
 				}
 				position++
-				add(ruleSegmentRegister, position835)
+				add(ruleSegmentRegister, position846)
 			}
 			return true
-		l834:
-			position, tokenIndex = position834, tokenIndex834
+		l845:
+			position, tokenIndex = position845, tokenIndex845
 			return false
 		},
 	}
 	p.rules = _rules
+	return nil
 }