Maintain a frame pointer in aesni-gcm-x86_64.pl and add SEH unwind codes

Some profiling systems cannot unwind with CFI and benefit from having a
frame pointer. Since this code doesn't have enough register pressure to
actually need to use rbp as a general register, this change tweaks
things so that a frame pointer is preserved.

As this would invalidate the SEH handler, just replace it with proper
unwind codes, which are more profiler-friendly and supportable by our
unwind tests. Some notes on this:

- We don't currently support the automatic calling convention conversion
  with unwind codes, but this file already puts all arguments in
  registers, so I just renamed the arguments and put the last two
  arguments in RDI and RSI. Those I stashed into the parameter stack
  area because it's free storage.

- It is tedious to write the same directives in both CFI and SEH. We
  really could do with an abstraction. Although since most of our
  functions need a Windows variation anyway.

- I restored the original file's use of PUSH to save the registers.
  This matches what Clang likes to output anyway, and push is probably
  smaller than the corresponding move with offset. (And it reduces how
  much thinking about offsets I need to do.)

- Although it's an extra instruction, I restored the original file's
  separate fixed stack allocation and alloca for the sake of clarity.

- The epilog is constrained by Windows being extremely picky about
  epilogs. (Windows doesn't annotate epilogs and instead simulates
  forward.) I think other options are possible, but using LEA with an
  offset to realign the stack for the POPs both matches the examples in
  Windows and what Clang seems to like to output. The original file used
  MOV with offset, but it seems to be related to the funny SEH handler.

- The offsets in SEH directives may be surprising to someone used to CFI
  directives or a SysV RBP frame pointer. All three use slightly
  different baselines:

  CFI's canonical frame address (CFA) is RSP just before a CALL (so
  before the saved RIP in stack order). It is 16-byte aligned by ABI.

  A SysV RBP frame pointer is 16 bytes after that, after a saved RIP and
  saved RBP. It is also 16-byte aligned.

  Windows' baseline is the top of the fixed stack allocation, so
  potentially some bytes after that (all pushreg and allocstack
  directives). This too is required to be 16-byte aligned.

  Windows, however, doesn't require the frame register actually contain
  the fixed stack allocation. You can specify an offset from the value
  in the register to the actual top. But all the offsets in savereg,
  etc., directives use this baseline.

Performance difference is within measurement noise.

This does not create a stack frame for internal functions so
frame-pointer unwinding may miss a function or two, but the broad
attribution will be correct.

Change originally by Clemens Fruhwirth. Then reworked from Adam
Langley's https://boringssl-review.googlesource.com/c/boringssl/+/55945
by me to work on Windows and fix up some issues with the RBP setup.

Bug: b/33072965, 259
Change-Id: I52302635a8ad3d9272404feac125e2a4a4a5d14c
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56128
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
index 21dbf69..e726c04 100644
--- a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
@@ -71,14 +71,18 @@
 # no AVX2 instructions being used.
 if ($avx>1) {{{
 
-($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# On Windows, only four parameters are passed in registers. The last two
+# parameters will be manually loaded into %rdi and %rsi.
+my ($inp, $out, $len, $key, $ivp, $Xip) =
+    $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") :
+             ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");
 
 ($Ii,$T1,$T2,$Hkey,
  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
 
 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
 
-($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15");
 
 $code=<<___;
 .text
@@ -390,7 +394,7 @@
 	  vaesenclast	$Hkey,$inout5,$inout5
 	 vpaddb		$T2,$Z3,$Hkey
 
-	add		\$0x60,$ret
+	add		\$0x60,%rax
 	sub		\$0x6,$len
 	jc		.L6x_done
 
@@ -424,46 +428,76 @@
 #		struct { u128 Xi,H,Htbl[9]; } *Xip);
 $code.=<<___;
 .globl	aesni_gcm_decrypt
-.type	aesni_gcm_decrypt,\@function,6
+.type	aesni_gcm_decrypt,\@abi-omnipotent
 .align	32
 aesni_gcm_decrypt:
 .cfi_startproc
-	xor	$ret,$ret
+.seh_startproc
+	xor	%rax,%rax
 
 	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
 	# bytes of input.
 	cmp	\$0x60,$len			# minimal accepted length
 	jb	.Lgcm_dec_abort
 
-	lea	(%rsp),%rax			# save stack pointer
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
 	push	%rbp
 .cfi_push	%rbp
+.seh_pushreg	%rbp
+	mov	%rsp, %rbp			# save stack pointer
+.cfi_def_cfa_register	%rbp
+	push	%rbx
+.cfi_push	%rbx
+.seh_pushreg	%rbx
 	push	%r12
 .cfi_push	%r12
+.seh_pushreg	%r12
 	push	%r13
 .cfi_push	%r13
+.seh_pushreg	%r13
 	push	%r14
 .cfi_push	%r14
+.seh_pushreg	%r14
 	push	%r15
 .cfi_push	%r15
+.seh_pushreg	%r15
 ___
-$code.=<<___ if ($win64);
-	lea	-0xa8(%rsp),%rsp
-	movaps	%xmm6,-0xd8(%rax)
-	movaps	%xmm7,-0xc8(%rax)
-	movaps	%xmm8,-0xb8(%rax)
-	movaps	%xmm9,-0xa8(%rax)
-	movaps	%xmm10,-0x98(%rax)
-	movaps	%xmm11,-0x88(%rax)
-	movaps	%xmm12,-0x78(%rax)
-	movaps	%xmm13,-0x68(%rax)
-	movaps	%xmm14,-0x58(%rax)
-	movaps	%xmm15,-0x48(%rax)
-.Lgcm_dec_body:
+if ($win64) {
+$code.=<<___
+	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
+.seh_allocstack	0xa8
+.seh_setframe	%rbp, 0xa8+5*8
+	# Load the last two parameters. These go into %rdi and %rsi, which are
+	# non-volatile on Windows, so stash them in the parameter stack area
+	# first.
+	mov	%rdi, 0x10(%rbp)
+.seh_savereg	%rdi, 0xa8+5*8+0x10
+	mov	%rsi, 0x18(%rbp)
+.seh_savereg	%rsi, 0xa8+5*8+0x18
+	mov	0x30(%rbp), $ivp
+	mov	0x38(%rbp), $Xip
+	# Save non-volatile XMM registers.
+	movaps	%xmm6,-0xd0(%rbp)
+.seh_savexmm128	%xmm6, 0xa8+5*8-0xd0
+	movaps	%xmm7,-0xc0(%rbp)
+.seh_savexmm128	%xmm7, 0xa8+5*8-0xc0
+	movaps	%xmm8,-0xb0(%rbp)
+.seh_savexmm128	%xmm8, 0xa8+5*8-0xb0
+	movaps	%xmm9,-0xa0(%rbp)
+.seh_savexmm128	%xmm9, 0xa8+5*8-0xa0
+	movaps	%xmm10,-0x90(%rbp)
+.seh_savexmm128	%xmm10, 0xa8+5*8-0x90
+	movaps	%xmm11,-0x80(%rbp)
+.seh_savexmm128	%xmm11, 0xa8+5*8-0x80
+	movaps	%xmm12,-0x70(%rbp)
+.seh_savexmm128	%xmm12, 0xa8+5*8-0x70
+	movaps	%xmm13,-0x60(%rbp)
+.seh_savexmm128	%xmm13, 0xa8+5*8-0x60
+	movaps	%xmm14,-0x50(%rbp)
+.seh_savexmm128	%xmm14, 0xa8+5*8-0x50
+	movaps	%xmm15,-0x40(%rbp)
+.seh_savexmm128	%xmm15, 0xa8+5*8-0x40
 ___
+}
 $code.=<<___;
 	vzeroupper
 
@@ -491,7 +525,7 @@
 .Ldec_no_key_aliasing:
 
 	vmovdqu		0x50($inp),$Z3		# I[5]
-	lea		($inp),$in0
+	mov		$inp,$in0
 	vmovdqu		0x40($inp),$Z0
 
 	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
@@ -504,7 +538,7 @@
 
 	vmovdqu		0x30($inp),$Z1
 	shr		\$4,$len
-	xor		$ret,$ret
+	xor		%rax,%rax
 	vmovdqu		0x20($inp),$Z2
 	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
 	vmovdqu		0x10($inp),$T2
@@ -535,35 +569,37 @@
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	-0xd8(%rax),%xmm6
-	movaps	-0xc8(%rax),%xmm7
-	movaps	-0xb8(%rax),%xmm8
-	movaps	-0xa8(%rax),%xmm9
-	movaps	-0x98(%rax),%xmm10
-	movaps	-0x88(%rax),%xmm11
-	movaps	-0x78(%rax),%xmm12
-	movaps	-0x68(%rax),%xmm13
-	movaps	-0x58(%rax),%xmm14
-	movaps	-0x48(%rax),%xmm15
+	movaps	-0xd0(%rbp),%xmm6
+	movaps	-0xc0(%rbp),%xmm7
+	movaps	-0xb0(%rbp),%xmm8
+	movaps	-0xa0(%rbp),%xmm9
+	movaps	-0x90(%rbp),%xmm10
+	movaps	-0x80(%rbp),%xmm11
+	movaps	-0x70(%rbp),%xmm12
+	movaps	-0x60(%rbp),%xmm13
+	movaps	-0x50(%rbp),%xmm14
+	movaps	-0x40(%rbp),%xmm15
+	mov	0x10(%rbp),%rdi
+	mov	0x18(%rbp),%rsi
 ___
 $code.=<<___;
-	mov	-48(%rax),%r15
-.cfi_restore	%r15
-	mov	-40(%rax),%r14
-.cfi_restore	%r14
-	mov	-32(%rax),%r13
-.cfi_restore	%r13
-	mov	-24(%rax),%r12
-.cfi_restore	%r12
-	mov	-16(%rax),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rax),%rbx
-.cfi_restore	%rbx
-	lea	(%rax),%rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
+	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
+.cfi_def_cfa	%rsp, 0x38
+	pop	%r15
+.cfi_pop	%r15
+	pop	%r14
+.cfi_pop	%r14
+	pop	%r13
+.cfi_pop	%r13
+	pop	%r12
+.cfi_pop	%r12
+	pop	%rbx
+.cfi_pop	%rbx
+	pop	%rbp
+.cfi_pop	%rbp
 .Lgcm_dec_abort:
-	mov	$ret,%rax		# return value
 	ret
+.seh_endproc
 .cfi_endproc
 .size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
 ___
@@ -663,15 +699,16 @@
 .size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
 
 .globl	aesni_gcm_encrypt
-.type	aesni_gcm_encrypt,\@function,6
+.type	aesni_gcm_encrypt,\@abi-omnipotent
 .align	32
 aesni_gcm_encrypt:
 .cfi_startproc
+.seh_startproc
 #ifdef BORINGSSL_DISPATCH_TEST
 .extern	BORINGSSL_function_hit
 	movb \$1,BORINGSSL_function_hit+2(%rip)
 #endif
-	xor	$ret,$ret
+	xor	%rax,%rax
 
 	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
 	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
@@ -679,35 +716,64 @@
 	cmp	\$0x60*3,$len			# minimal accepted length
 	jb	.Lgcm_enc_abort
 
-	lea	(%rsp),%rax			# save stack pointer
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
 	push	%rbp
 .cfi_push	%rbp
+.seh_pushreg	%rbp
+	mov	%rsp, %rbp			# save stack pointer
+.cfi_def_cfa_register	%rbp
+	push	%rbx
+.cfi_push	%rbx
+.seh_pushreg	%rbx
 	push	%r12
 .cfi_push	%r12
+.seh_pushreg	%r12
 	push	%r13
 .cfi_push	%r13
+.seh_pushreg	%r13
 	push	%r14
 .cfi_push	%r14
+.seh_pushreg	%r14
 	push	%r15
 .cfi_push	%r15
+.seh_pushreg	%r15
 ___
-$code.=<<___ if ($win64);
-	lea	-0xa8(%rsp),%rsp
-	movaps	%xmm6,-0xd8(%rax)
-	movaps	%xmm7,-0xc8(%rax)
-	movaps	%xmm8,-0xb8(%rax)
-	movaps	%xmm9,-0xa8(%rax)
-	movaps	%xmm10,-0x98(%rax)
-	movaps	%xmm11,-0x88(%rax)
-	movaps	%xmm12,-0x78(%rax)
-	movaps	%xmm13,-0x68(%rax)
-	movaps	%xmm14,-0x58(%rax)
-	movaps	%xmm15,-0x48(%rax)
-.Lgcm_enc_body:
+if ($win64) {
+$code.=<<___
+	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
+.seh_allocstack	0xa8
+.seh_setframe	%rbp, 0xa8+5*8
+	# Load the last two parameters. These go into %rdi and %rsi, which are
+	# non-volatile on Windows, so stash them in the parameter stack area
+	# first.
+	mov	%rdi, 0x10(%rbp)
+.seh_savereg	%rdi, 0xa8+5*8+0x10
+	mov	%rsi, 0x18(%rbp)
+.seh_savereg	%rsi, 0xa8+5*8+0x18
+	mov	0x30(%rbp), $ivp
+	mov	0x38(%rbp), $Xip
+	# Save non-volatile XMM registers.
+	movaps	%xmm6,-0xd0(%rbp)
+.seh_savexmm128	%xmm6, 0xa8+5*8-0xd0
+	movaps	%xmm7,-0xc0(%rbp)
+.seh_savexmm128	%xmm7, 0xa8+5*8-0xc0
+	movaps	%xmm8,-0xb0(%rbp)
+.seh_savexmm128	%xmm8, 0xa8+5*8-0xb0
+	movaps	%xmm9,-0xa0(%rbp)
+.seh_savexmm128	%xmm9, 0xa8+5*8-0xa0
+	movaps	%xmm10,-0x90(%rbp)
+.seh_savexmm128	%xmm10, 0xa8+5*8-0x90
+	movaps	%xmm11,-0x80(%rbp)
+.seh_savexmm128	%xmm11, 0xa8+5*8-0x80
+	movaps	%xmm12,-0x70(%rbp)
+.seh_savexmm128	%xmm12, 0xa8+5*8-0x70
+	movaps	%xmm13,-0x60(%rbp)
+.seh_savexmm128	%xmm13, 0xa8+5*8-0x60
+	movaps	%xmm14,-0x50(%rbp)
+.seh_savexmm128	%xmm14, 0xa8+5*8-0x50
+	movaps	%xmm15,-0x40(%rbp)
+.seh_savexmm128	%xmm15, 0xa8+5*8-0x40
 ___
+}
 $code.=<<___;
 	vzeroupper
 
@@ -731,7 +797,7 @@
 	sub		$end0,%rsp		# avoid aliasing with key
 .Lenc_no_key_aliasing:
 
-	lea		($out),$in0
+	mov		$out,$in0
 
 	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
 	# bytes before the end of the input. Note, in particular, that this is
@@ -762,7 +828,7 @@
 	vmovdqu		($Xip),$Xi		# load Xi
 	lea		0x20+0x20($Xip),$Xip	# size optimization
 	sub		\$12,$len
-	mov		\$0x60*2,$ret
+	mov		\$0x60*2,%rax
 	vpshufb		$Ii,$Xi,$Xi
 
 	call		_aesni_ctr32_ghash_6x
@@ -951,37 +1017,39 @@
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	-0xd8(%rax),%xmm6
-	movaps	-0xc8(%rax),%xmm7
-	movaps	-0xb8(%rax),%xmm8
-	movaps	-0xa8(%rax),%xmm9
-	movaps	-0x98(%rax),%xmm10
-	movaps	-0x88(%rax),%xmm11
-	movaps	-0x78(%rax),%xmm12
-	movaps	-0x68(%rax),%xmm13
-	movaps	-0x58(%rax),%xmm14
-	movaps	-0x48(%rax),%xmm15
+	movaps	-0xd0(%rbp),%xmm6
+	movaps	-0xc0(%rbp),%xmm7
+	movaps	-0xb0(%rbp),%xmm8
+	movaps	-0xa0(%rbp),%xmm9
+	movaps	-0x90(%rbp),%xmm10
+	movaps	-0x80(%rbp),%xmm11
+	movaps	-0x70(%rbp),%xmm12
+	movaps	-0x60(%rbp),%xmm13
+	movaps	-0x50(%rbp),%xmm14
+	movaps	-0x40(%rbp),%xmm15
+	mov	0x10(%rbp),%rdi
+	mov	0x18(%rbp),%rsi
 ___
 $code.=<<___;
-	mov	-48(%rax),%r15
-.cfi_restore	%r15
-	mov	-40(%rax),%r14
-.cfi_restore	%r14
-	mov	-32(%rax),%r13
-.cfi_restore	%r13
-	mov	-24(%rax),%r12
-.cfi_restore	%r12
-	mov	-16(%rax),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rax),%rbx
-.cfi_restore	%rbx
-	lea	(%rax),%rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
+	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
+.cfi_def_cfa	%rsp, 0x38
+	pop	%r15
+.cfi_pop	%r15
+	pop	%r14
+.cfi_pop	%r14
+	pop	%r13
+.cfi_pop	%r13
+	pop	%r12
+.cfi_pop	%r12
+	pop	%rbx
+.cfi_pop	%rbx
+	pop	%rbp
+.cfi_pop	%rbp
 .Lgcm_enc_abort:
-	mov	$ret,%rax		# return value
 	ret
+.seh_endproc
 .cfi_endproc
-.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
 ___
 
 $code.=<<___;
@@ -999,127 +1067,6 @@
 .asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	64
 ___
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___
-.extern	__imp_RtlVirtualUnwind
-.type	gcm_se_handler,\@abi-omnipotent
-.align	16
-gcm_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lcommon_seh_tail
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lcommon_seh_tail
-
-	mov	120($context),%rax	# pull context->Rax
-
-	mov	-48(%rax),%r15
-	mov	-40(%rax),%r14
-	mov	-32(%rax),%r13
-	mov	-24(%rax),%r12
-	mov	-16(%rax),%rbp
-	mov	-8(%rax),%rbx
-	mov	%r15,240($context)
-	mov	%r14,232($context)
-	mov	%r13,224($context)
-	mov	%r12,216($context)
-	mov	%rbp,160($context)
-	mov	%rbx,144($context)
-
-	lea	-0xd8(%rax),%rsi	# %xmm save area
-	lea	512($context),%rdi	# & context.Xmm6
-	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
-	.long	0xa548f3fc		# cld; rep movsq
-
-.Lcommon_seh_tail:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$154,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	gcm_se_handler,.-gcm_se_handler
-
-.section	.pdata
-.align	4
-	.rva	.LSEH_begin_aesni_gcm_decrypt
-	.rva	.LSEH_end_aesni_gcm_decrypt
-	.rva	.LSEH_gcm_dec_info
-
-	.rva	.LSEH_begin_aesni_gcm_encrypt
-	.rva	.LSEH_end_aesni_gcm_encrypt
-	.rva	.LSEH_gcm_enc_info
-.section	.xdata
-.align	8
-.LSEH_gcm_dec_info:
-	.byte	9,0,0,0
-	.rva	gcm_se_handler
-	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
-.LSEH_gcm_enc_info:
-	.byte	9,0,0,0
-	.rva	gcm_se_handler
-	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
-___
-}
 }}} else {{{
 $code=<<___;	# assembler is too old
 .text
diff --git a/crypto/fipsmodule/modes/gcm_test.cc b/crypto/fipsmodule/modes/gcm_test.cc
index 8d15cc6..324d0e8 100644
--- a/crypto/fipsmodule/modes/gcm_test.cc
+++ b/crypto/fipsmodule/modes/gcm_test.cc
@@ -173,17 +173,17 @@
 
         aes_hw_set_encrypt_key(kKey, 128, &aes_key);
         for (size_t blocks : kBlockCounts) {
-          CHECK_ABI(aesni_gcm_encrypt, buf, buf, blocks * 16, &aes_key, iv,
-                    gcm.Xi.u);
-          CHECK_ABI(aesni_gcm_encrypt, buf, buf, blocks * 16 + 7, &aes_key, iv,
-                    gcm.Xi.u);
+          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16, &aes_key, iv,
+                        gcm.Xi.u);
+          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16 + 7, &aes_key,
+                        iv, gcm.Xi.u);
         }
         aes_hw_set_decrypt_key(kKey, 128, &aes_key);
         for (size_t blocks : kBlockCounts) {
-          CHECK_ABI(aesni_gcm_decrypt, buf, buf, blocks * 16, &aes_key, iv,
-                    gcm.Xi.u);
-          CHECK_ABI(aesni_gcm_decrypt, buf, buf, blocks * 16 + 7, &aes_key, iv,
-                    gcm.Xi.u);
+          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16, &aes_key, iv,
+                        gcm.Xi.u);
+          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16 + 7, &aes_key,
+                        iv, gcm.Xi.u);
         }
       }
     }