Remove SSE2 checks in 32-bit x86 assembly

crypto/internal.h has required SSE2 support for a few months now
without much fuss. Finish the job and remove the non-SSE2 fallback
paths. We've never tested any of these paths, and removing them
eliminates a slew of OPENSSL_ia32cap_P references from the assembly.

Bug: 673
Change-Id: I446a033d132af5038ab427b8560cbf20c1d97335
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68207
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/asm/bn-586.pl b/crypto/fipsmodule/bn/asm/bn-586.pl
index eae6da9..e4fc0f9 100644
--- a/crypto/fipsmodule/bn/asm/bn-586.pl
+++ b/crypto/fipsmodule/bn/asm/bn-586.pl
@@ -18,8 +18,6 @@
 
 $sse2=1;
 
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
 &bn_mul_add_words("bn_mul_add_words");
 &bn_mul_words("bn_mul_words");
 &bn_sqr_words("bn_sqr_words");
@@ -35,17 +33,13 @@
 	{
 	local($name)=@_;
 
-	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+	&function_begin_B($name);
 
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 
 	if ($sse2) {
-		&picmeup("eax","OPENSSL_ia32cap_P");
-		&bt(&DWP(0,"eax"),26);
-		&jnc(&label("maw_non_sse2"));
-
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
@@ -135,85 +129,7 @@
 		&movd("eax","mm1");		# c = carry_out
 		&emms();
 		&ret();
-
-	&set_label("maw_non_sse2",16);
 	}
-
-	# function_begin prologue
-	&push("ebp");
-	&push("ebx");
-	&push("esi");
-	&push("edi");
-
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ebp";
-	$r="edi";
-	$c="esi";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-
-	&mov("ecx",&wparam(2));	#
-	&mov($a,&wparam(1));	#
-
-	&and("ecx",0xfffffff8);	# num / 8
-	&mov($w,&wparam(3));	#
-
-	&push("ecx");		# Up the stack for a tmp variable
-
-	&jz(&label("maw_finish"));
-
-	&set_label("maw_loop",16);
-
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-
-		 &mov("eax",&DWP($i,$a)); 	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+= c
-		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",&DWP($i,$r));	# L(t)+= *r
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
-		&mov($c,"edx");			# c=  H(t);
-		}
-
-	&comment("");
-	&sub("ecx",8);
-	&lea($a,&DWP(32,$a));
-	&lea($r,&DWP(32,$r));
-	&jnz(&label("maw_loop"));
-
-	&set_label("maw_finish",0);
-	&mov("ecx",&wparam(2));	# get num
-	&and("ecx",7);
-	&jnz(&label("maw_finish2"));	# helps branch prediction
-	&jmp(&label("maw_end"));
-
-	&set_label("maw_finish2",1);
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a));	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
-		&adc("edx",0);			# H(t)+=carry
-		 &dec("ecx") if ($i != 7-1);
-		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
-		 &mov($c,"edx");		# c=  H(t);
-		&jz(&label("maw_end")) if ($i != 7-1);
-		}
-	&set_label("maw_end",0);
-	&mov("eax",$c);
-
-	&pop("ecx");	# clear variable from
-
 	&function_end($name);
 	}
 
@@ -221,17 +137,13 @@
 	{
 	local($name)=@_;
 
-	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+	&function_begin_B($name);
 
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 
 	if ($sse2) {
-		&picmeup("eax","OPENSSL_ia32cap_P");
-		&bt(&DWP(0,"eax"),26);
-		&jnc(&label("mw_non_sse2"));
-
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
@@ -252,79 +164,7 @@
 		&movd("eax","mm1");		# return carry
 		&emms();
 		&ret();
-	&set_label("mw_non_sse2",16);
 	}
-
-	# function_begin prologue
-	&push("ebp");
-	&push("ebx");
-	&push("esi");
-	&push("edi");
-
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ecx";
-	$r="edi";
-	$c="esi";
-	$num="ebp";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-	&mov($a,&wparam(1));	#
-	&mov($num,&wparam(2));	#
-	&mov($w,&wparam(3));	#
-
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("mw_finish"));
-
-	&set_label("mw_loop",0);
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		 # XXX
-
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
-
-		&mov($c,"edx");			# c=  H(t);
-		}
-
-	&comment("");
-	&add($a,32);
-	&add($r,32);
-	&sub($num,8);
-	&jz(&label("mw_finish"));
-	&jmp(&label("mw_loop"));
-
-	&set_label("mw_finish",0);
-	&mov($num,&wparam(2));	# get num
-	&and($num,7);
-	&jnz(&label("mw_finish2"));
-	&jmp(&label("mw_end"));
-
-	&set_label("mw_finish2",1);
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a,"",0));# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);			# L(t)+=c
-		 # XXX
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
-		&mov($c,"edx");			# c=  H(t);
-		 &dec($num) if ($i != 7-1);
-		&jz(&label("mw_end")) if ($i != 7-1);
-		}
-	&set_label("mw_end",0);
-	&mov("eax",$c);
-
 	&function_end($name);
 	}
 
@@ -332,17 +172,13 @@
 	{
 	local($name)=@_;
 
-	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+	&function_begin_B($name);
 
 	$r="eax";
 	$a="edx";
 	$c="ecx";
 
 	if ($sse2) {
-		&picmeup("eax","OPENSSL_ia32cap_P");
-		&bt(&DWP(0,"eax"),26);
-		&jnc(&label("sqr_non_sse2"));
-
 		&mov($r,&wparam(0));
 		&mov($a,&wparam(1));
 		&mov($c,&wparam(2));
@@ -358,62 +194,7 @@
 
 		&emms();
 		&ret();
-	&set_label("sqr_non_sse2",16);
 	}
-
-	# function_begin prologue
-	&push("ebp");
-	&push("ebx");
-	&push("esi");
-	&push("edi");
-
-	&comment("");
-	$r="esi";
-	$a="edi";
-	$num="ebx";
-
-	&mov($r,&wparam(0));	#
-	&mov($a,&wparam(1));	#
-	&mov($num,&wparam(2));	#
-
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("sw_finish"));
-
-	&set_label("sw_loop",0);
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-		&mov("eax",&DWP($i,$a,"",0)); 	# *a
-		 # XXX
-		&mul("eax");			# *a * *a
-		&mov(&DWP($i*2,$r,"",0),"eax");	#
-		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
-		}
-
-	&comment("");
-	&add($a,32);
-	&add($r,64);
-	&sub($num,8);
-	&jnz(&label("sw_loop"));
-
-	&set_label("sw_finish",0);
-	&mov($num,&wparam(2));	# get num
-	&and($num,7);
-	&jz(&label("sw_end"));
-
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		&mov("eax",&DWP($i*4,$a,"",0));	# *a
-		 # XXX
-		&mul("eax");			# *a * *a
-		&mov(&DWP($i*8,$r,"",0),"eax");	#
-		 &dec($num) if ($i != 7-1);
-		&mov(&DWP($i*8+4,$r,"",0),"edx");
-		 &jz(&label("sw_end")) if ($i != 7-1);
-		}
-	&set_label("sw_end",0);
-
 	&function_end($name);
 	}
 
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index c097574..7b7cca7 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -44,8 +44,6 @@
 
 $sse2=1;
 
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
 &function_begin("bn_mul_mont");
 
 $i="edx";
@@ -146,10 +144,6 @@
 $temp="mm6";
 $mask="mm7";
 
-	&picmeup("eax","OPENSSL_ia32cap_P");
-	&bt	(&DWP(0,"eax"),26);
-	&jnc	(&label("non_sse2"));
-
 	&mov	("eax",-1);
 	&movd	($mask,"eax");		# mask 32 lower bits
 
@@ -291,298 +285,6 @@
 
 	&emms	();				# done with mmx bank
 	&jmp	(&label("common_tail"));
-
-&set_label("non_sse2",16);
-}
-
-if (0) {
-	&mov	("esp",$_sp);
-	&xor	("eax","eax");	# signal "not fast enough [yet]"
-	&jmp	(&label("just_leave"));
-	# While the below code provides competitive performance for
-	# all key lengths on modern Intel cores, it's still more
-	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
-	# means compared to the original integer-only assembler.
-	# 512-bit RSA sign is better by ~40%, but that's about all
-	# one can say about all CPUs...
-} else {
-$inp="esi";	# integer path uses these registers differently
-$word="edi";
-$carry="ebp";
-
-	&mov	($inp,$_ap);
-	&lea	($carry,&DWP(1,$num));
-	&mov	($word,$_bp);
-	&xor	($j,$j);				# j=0
-	&mov	("edx",$inp);
-	&and	($carry,1);				# see if num is even
-	&sub	("edx",$word);				# see if ap==bp
-	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
-	&or	($carry,"edx");
-	&mov	($word,&DWP(0,$word));			# bp[0]
-	&jz	(&label("bn_sqr_mont"));
-	&mov	($_bpend,"eax");
-	&mov	("eax",&DWP(0,$inp));
-	&xor	("edx","edx");
-
-&set_label("mull",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*bp[0]
-	&add	($carry,"eax");
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("mull"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*bp[0]
-	 &mov	($word,$_n0);
-	&add	("eax",$carry);
-	 &mov	($inp,$_np);
-	&adc	("edx",0);
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
-	&xor	($j,$j);
-	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
-
-	&mov	("eax",&DWP(0,$inp));			# np[0]
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&adc	("edx",0);
-	&inc	($j);
-
-	&jmp	(&label("2ndmadd"));
-
-&set_label("1stmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*bp[i]
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("1stmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*bp[i]
-	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	 &mov	($word,$_n0);
-	&adc	("edx",0);
-	 &mov	($inp,$_np);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&xor	($j,$j);
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
-	&adc	($j,0);
-	 &mov	("eax",&DWP(0,$inp));			# np[0]
-	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&adc	("edx",0);
-	&mov	($j,1);
-
-&set_label("2ndmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
-	&jl	(&label("2ndmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
-
-	&xor	("eax","eax");
-	 &mov	($j,$_bp);				# &bp[i]
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
-	 &lea	($j,&DWP(4,$j));
-	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
-	 &cmp	($j,$_bpend);
-	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-	&je	(&label("common_tail"));
-
-	&mov	($word,&DWP(0,$j));			# bp[i+1]
-	&mov	($inp,$_ap);
-	&mov	($_bp,$j);				# &bp[++i]
-	&xor	($j,$j);
-	&xor	("edx","edx");
-	&mov	("eax",&DWP(0,$inp));
-	&jmp	(&label("1stmadd"));
-
-&set_label("bn_sqr_mont",16);
-$sbit=$num;
-	&mov	($_num,$num);
-	&mov	($_bp,$j);				# i=0
-
-	&mov	("eax",$word);				# ap[0]
-	&mul	($word);				# ap[0]*ap[0]
-	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
-	&mov	($sbit,"edx");
-	&shr	("edx",1);
-	&and	($sbit,1);
-	&inc	($j);
-&set_label("sqr",16);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*ap[0]
-	&add	("eax",$carry);
-	&lea	($j,&DWP(1,$j));
-	&adc	("edx",0);
-	&lea	($carry,&DWP(0,$sbit,"eax",2));
-	&shr	("eax",31);
-	&cmp	($j,$_num);
-	&mov	($sbit,"eax");
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("sqr"));
-
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[num-1]*ap[0]
-	&add	("eax",$carry);
-	 &mov	($word,$_n0);
-	&adc	("edx",0);
-	 &mov	($inp,$_np);
-	&lea	($carry,&DWP(0,$sbit,"eax",2));
-	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-	&shr	("eax",31);
-	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
-
-	&lea	($carry,&DWP(0,"eax","edx",2));
-	 &mov	("eax",&DWP(0,$inp));			# np[0]
-	&shr	("edx",31);
-	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
-	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&mov	($num,$j);
-	&adc	("edx",0);
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-	&mov	($j,1);
-
-&set_label("3rdmadd",16);
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j+1]*m
-	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
-	&lea	($j,&DWP(2,$j));
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
-	&adc	("edx",0);
-	&cmp	($j,$num);
-	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
-	&jl	(&label("3rdmadd"));
-
-	&mov	($carry,"edx");
-	&mul	($word);				# np[j]*m
-	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
-	&adc	("edx",0);
-	&add	($carry,"eax");
-	&adc	("edx",0);
-	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
-
-	&mov	($j,$_bp);				# i
-	&xor	("eax","eax");
-	&mov	($inp,$_ap);
-	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
-	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
-	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
-	&cmp	($j,$num);
-	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
-	&je	(&label("common_tail"));
-
-	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
-	&lea	($j,&DWP(1,$j));
-	&mov	("eax",$word);
-	&mov	($_bp,$j);				# ++i
-	&mul	($word);				# ap[i]*ap[i]
-	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
-	&adc	("edx",0);
-	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
-	&xor	($carry,$carry);
-	&cmp	($j,$num);
-	&lea	($j,&DWP(1,$j));
-	&je	(&label("sqrlast"));
-
-	&mov	($sbit,"edx");				# zaps $num
-	&shr	("edx",1);
-	&and	($sbit,1);
-&set_label("sqradd",16);
-	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
-	&mov	($carry,"edx");
-	&mul	($word);				# ap[j]*ap[i]
-	&add	("eax",$carry);
-	&lea	($carry,&DWP(0,"eax","eax"));
-	&adc	("edx",0);
-	&shr	("eax",31);
-	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
-	&lea	($j,&DWP(1,$j));
-	&adc	("eax",0);
-	&add	($carry,$sbit);
-	&adc	("eax",0);
-	&cmp	($j,$_num);
-	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
-	&mov	($sbit,"eax");
-	&jle	(&label("sqradd"));
-
-	&mov	($carry,"edx");
-	&add	("edx","edx");
-	&shr	($carry,31);
-	&add	("edx",$sbit);
-	&adc	($carry,0);
-&set_label("sqrlast");
-	&mov	($word,$_n0);
-	&mov	($inp,$_np);
-	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
-
-	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
-	&mov	("eax",&DWP(0,$inp));			# np[0]
-	&adc	($carry,0);
-	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
-	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
-
-	&mul	($word);				# np[0]*m
-	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
-	&lea	($num,&DWP(-1,$j));
-	&adc	("edx",0);
-	&mov	($j,1);
-	&mov	("eax",&DWP(4,$inp));			# np[1]
-
-	&jmp	(&label("3rdmadd"));
 }
 
 &set_label("common_tail",16);
diff --git a/crypto/fipsmodule/sha/asm/sha512-586.pl b/crypto/fipsmodule/sha/asm/sha512-586.pl
index b288776..67ad8a3 100644
--- a/crypto/fipsmodule/sha/asm/sha512-586.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-586.pl
@@ -315,9 +315,6 @@
 if ($sse2) {
 	&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
 	&mov	("ecx",&DWP(0,"edx"));
-	&test	("ecx",1<<26);
-	&jz	(&label("loop_x86"));
-
 	&mov	("edx",&DWP(4,"edx"));
 
 	# load ctx->h[0-7]
@@ -688,149 +685,6 @@
 }
 &function_end_A();
 }
-&set_label("loop_x86",16);
-    # copy input block to stack reversing byte and qword order
-    for ($i=0;$i<8;$i++) {
-	&mov	("eax",&DWP($i*16+0,"edi"));
-	&mov	("ebx",&DWP($i*16+4,"edi"));
-	&mov	("ecx",&DWP($i*16+8,"edi"));
-	&mov	("edx",&DWP($i*16+12,"edi"));
-	&bswap	("eax");
-	&bswap	("ebx");
-	&bswap	("ecx");
-	&bswap	("edx");
-	&push	("eax");
-	&push	("ebx");
-	&push	("ecx");
-	&push	("edx");
-    }
-	&add	("edi",128);
-	&sub	("esp",9*8);		# place for T,A,B,C,D,E,F,G,H
-	&mov	(&DWP(8*(9+16)+4,"esp"),"edi");
-
-	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
-	&lea	("edi",&DWP(8,"esp"));
-	&mov	("ecx",16);
-	&data_word(0xA5F3F689);		# rep movsd
-
-&set_label("00_15_x86",16);
-	&BODY_00_15_x86();
-
-	&cmp	(&LB("edx"),0x94);
-	&jne	(&label("00_15_x86"));
-
-&set_label("16_79_x86",16);
-	#define sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
-	#	LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
-	#	HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
-	&mov	("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
-	&mov	("edx",&DWP(8*(9+15+16-1)+4,"esp"));
-	&mov	("esi","ecx");
-
-	&shr	("ecx",1);	# lo>>1
-	&mov	("edi","edx");
-	&shr	("edx",1);	# hi>>1
-	&mov	("eax","ecx");
-	&shl	("esi",24);	# lo<<24
-	&mov	("ebx","edx");
-	&shl	("edi",24);	# hi<<24
-	&xor	("ebx","esi");
-
-	&shr	("ecx",7-1);	# lo>>7
-	&xor	("eax","edi");
-	&shr	("edx",7-1);	# hi>>7
-	&xor	("eax","ecx");
-	&shl	("esi",31-24);	# lo<<31
-	&xor	("ebx","edx");
-	&shl	("edi",25-24);	# hi<<25
-	&xor	("ebx","esi");
-
-	&shr	("ecx",8-7);	# lo>>8
-	&xor	("eax","edi");
-	&shr	("edx",8-7);	# hi>>8
-	&xor	("eax","ecx");
-	&shl	("edi",31-25);	# hi<<31
-	&xor	("ebx","edx");
-	&xor	("eax","edi");			# T1 = sigma0(X[-15])
-
-	&mov	(&DWP(0,"esp"),"eax");
-	&mov	(&DWP(4,"esp"),"ebx");		# put T1 away
-
-	#define sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
-	#	LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
-	#	HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
-	&mov	("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
-	&mov	("edx",&DWP(8*(9+15+16-14)+4,"esp"));
-	&mov	("esi","ecx");
-
-	&shr	("ecx",6);	# lo>>6
-	&mov	("edi","edx");
-	&shr	("edx",6);	# hi>>6
-	&mov	("eax","ecx");
-	&shl	("esi",3);	# lo<<3
-	&mov	("ebx","edx");
-	&shl	("edi",3);	# hi<<3
-	&xor	("eax","esi");
-
-	&shr	("ecx",19-6);	# lo>>19
-	&xor	("ebx","edi");
-	&shr	("edx",19-6);	# hi>>19
-	&xor	("eax","ecx");
-	&shl	("esi",13-3);	# lo<<13
-	&xor	("ebx","edx");
-	&shl	("edi",13-3);	# hi<<13
-	&xor	("ebx","esi");
-
-	&shr	("ecx",29-19);	# lo>>29
-	&xor	("eax","edi");
-	&shr	("edx",29-19);	# hi>>29
-	&xor	("ebx","ecx");
-	&shl	("edi",26-13);	# hi<<26
-	&xor	("eax","edx");
-	&xor	("eax","edi");			# sigma1(X[-2])
-
-	&mov	("ecx",&DWP(8*(9+15+16)+0,"esp"));
-	&mov	("edx",&DWP(8*(9+15+16)+4,"esp"));
-	&add	("eax",&DWP(0,"esp"));
-	&adc	("ebx",&DWP(4,"esp"));		# T1 = sigma1(X[-2])+T1
-	&mov	("esi",&DWP(8*(9+15+16-9)+0,"esp"));
-	&mov	("edi",&DWP(8*(9+15+16-9)+4,"esp"));
-	&add	("eax","ecx");
-	&adc	("ebx","edx");			# T1 += X[-16]
-	&add	("eax","esi");
-	&adc	("ebx","edi");			# T1 += X[-7]
-	&mov	(&DWP(8*(9+15)+0,"esp"),"eax");
-	&mov	(&DWP(8*(9+15)+4,"esp"),"ebx");	# save X[0]
-
-	&BODY_00_15_x86();
-
-	&cmp	(&LB("edx"),0x17);
-	&jne	(&label("16_79_x86"));
-
-	&mov	("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
-	&mov	("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
-    for($i=0;$i<4;$i++) {
-	&mov	("eax",&DWP($i*16+0,"esi"));
-	&mov	("ebx",&DWP($i*16+4,"esi"));
-	&mov	("ecx",&DWP($i*16+8,"esi"));
-	&mov	("edx",&DWP($i*16+12,"esi"));
-	&add	("eax",&DWP(8+($i*16)+0,"esp"));
-	&adc	("ebx",&DWP(8+($i*16)+4,"esp"));
-	&mov	(&DWP($i*16+0,"esi"),"eax");
-	&mov	(&DWP($i*16+4,"esi"),"ebx");
-	&add	("ecx",&DWP(8+($i*16)+8,"esp"));
-	&adc	("edx",&DWP(8+($i*16)+12,"esp"));
-	&mov	(&DWP($i*16+8,"esi"),"ecx");
-	&mov	(&DWP($i*16+12,"esi"),"edx");
-    }
-	&add	("esp",8*(9+16+80));		# destroy frame
-	&sub	($K512,8*80);			# rewind K
-
-	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
-	&jb	(&label("loop_x86"));
-
-	&mov	("esp",&DWP(12,"esp"));		# restore sp
-&function_end_A();
 
 &set_label("K512",64);	# Yes! I keep it in the code segment!
 	&data_word(0xd728ae22,0x428a2f98);	# u64
diff --git a/gen/bcm/bn-586-apple.S b/gen/bcm/bn-586-apple.S
index 93513d0..f483ef1 100644
--- a/gen/bcm/bn-586-apple.S
+++ b/gen/bcm/bn-586-apple.S
@@ -10,20 +10,14 @@
 .align	4
 _bn_mul_add_words:
 L_bn_mul_add_words_begin:
-	call	L000PIC_me_up
-L000PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L001maw_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 	movd	16(%esp),%mm0
 	pxor	%mm1,%mm1
-	jmp	L002maw_sse2_entry
+	jmp	L000maw_sse2_entry
 .align	4,0x90
-L003maw_sse2_unrolled:
+L001maw_sse2_unrolled:
 	movd	(%eax),%mm3
 	paddq	%mm3,%mm1
 	movd	(%edx),%mm2
@@ -83,12 +77,12 @@
 	leal	32(%eax),%eax
 	psrlq	$32,%mm1
 	subl	$8,%ecx
-	jz	L004maw_sse2_exit
-L002maw_sse2_entry:
+	jz	L002maw_sse2_exit
+L000maw_sse2_entry:
 	testl	$4294967288,%ecx
-	jnz	L003maw_sse2_unrolled
+	jnz	L001maw_sse2_unrolled
 .align	2,0x90
-L005maw_sse2_loop:
+L003maw_sse2_loop:
 	movd	(%edx),%mm2
 	movd	(%eax),%mm3
 	pmuludq	%mm0,%mm2
@@ -99,189 +93,11 @@
 	subl	$1,%ecx
 	psrlq	$32,%mm1
 	leal	4(%eax),%eax
-	jnz	L005maw_sse2_loop
-L004maw_sse2_exit:
+	jnz	L003maw_sse2_loop
+L002maw_sse2_exit:
 	movd	%mm1,%eax
 	emms
 	ret
-.align	4,0x90
-L001maw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	28(%esp),%ecx
-	movl	24(%esp),%ebx
-	andl	$4294967288,%ecx
-	movl	32(%esp),%ebp
-	pushl	%ecx
-	jz	L006maw_finish
-.align	4,0x90
-L007maw_loop:
-	# Round 0 
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	# Round 4 
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	# Round 8 
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	# Round 12 
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	# Round 16 
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	# Round 20 
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	# Round 24 
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-	# Round 28 
-	movl	28(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	28(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	subl	$8,%ecx
-	leal	32(%ebx),%ebx
-	leal	32(%edi),%edi
-	jnz	L007maw_loop
-L006maw_finish:
-	movl	32(%esp),%ecx
-	andl	$7,%ecx
-	jnz	L008maw_finish2
-	jmp	L009maw_end
-L008maw_finish2:
-	# Tail Round 0 
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 1 
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 2 
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 3 
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 4 
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 5 
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 6 
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-L009maw_end:
-	movl	%esi,%eax
-	popl	%ecx
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -292,19 +108,13 @@
 .align	4
 _bn_mul_words:
 L_bn_mul_words_begin:
-	call	L010PIC_me_up
-L010PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L011mw_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 	movd	16(%esp),%mm0
 	pxor	%mm1,%mm1
 .align	4,0x90
-L012mw_sse2_loop:
+L004mw_sse2_loop:
 	movd	(%edx),%mm2
 	pmuludq	%mm0,%mm2
 	leal	4(%edx),%edx
@@ -313,156 +123,10 @@
 	subl	$1,%ecx
 	psrlq	$32,%mm1
 	leal	4(%eax),%eax
-	jnz	L012mw_sse2_loop
+	jnz	L004mw_sse2_loop
 	movd	%mm1,%eax
 	emms
 	ret
-.align	4,0x90
-L011mw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebx
-	movl	28(%esp),%ebp
-	movl	32(%esp),%ecx
-	andl	$4294967288,%ebp
-	jz	L013mw_finish
-L014mw_loop:
-	# Round 0 
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	# Round 4 
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	# Round 8 
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	# Round 12 
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	# Round 16 
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	# Round 20 
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	# Round 24 
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-	# Round 28 
-	movl	28(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	addl	$32,%ebx
-	addl	$32,%edi
-	subl	$8,%ebp
-	jz	L013mw_finish
-	jmp	L014mw_loop
-L013mw_finish:
-	movl	28(%esp),%ebp
-	andl	$7,%ebp
-	jnz	L015mw_finish2
-	jmp	L016mw_end
-L015mw_finish2:
-	# Tail Round 0 
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 1 
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 2 
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 3 
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 4 
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 5 
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 6 
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-L016mw_end:
-	movl	%esi,%eax
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -473,136 +137,20 @@
 .align	4
 _bn_sqr_words:
 L_bn_sqr_words_begin:
-	call	L017PIC_me_up
-L017PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L018sqr_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 .align	4,0x90
-L019sqr_sse2_loop:
+L005sqr_sse2_loop:
 	movd	(%edx),%mm0
 	pmuludq	%mm0,%mm0
 	leal	4(%edx),%edx
 	movq	%mm0,(%eax)
 	subl	$1,%ecx
 	leal	8(%eax),%eax
-	jnz	L019sqr_sse2_loop
+	jnz	L005sqr_sse2_loop
 	emms
 	ret
-.align	4,0x90
-L018sqr_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%ebx
-	andl	$4294967288,%ebx
-	jz	L020sw_finish
-L021sw_loop:
-	# Round 0 
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	movl	%edx,4(%esi)
-	# Round 4 
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	movl	%edx,12(%esi)
-	# Round 8 
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	movl	%edx,20(%esi)
-	# Round 12 
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	movl	%edx,28(%esi)
-	# Round 16 
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	movl	%edx,36(%esi)
-	# Round 20 
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	movl	%edx,44(%esi)
-	# Round 24 
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-	# Round 28 
-	movl	28(%edi),%eax
-	mull	%eax
-	movl	%eax,56(%esi)
-	movl	%edx,60(%esi)
-
-	addl	$32,%edi
-	addl	$64,%esi
-	subl	$8,%ebx
-	jnz	L021sw_loop
-L020sw_finish:
-	movl	28(%esp),%ebx
-	andl	$7,%ebx
-	jz	L022sw_end
-	# Tail Round 0 
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	decl	%ebx
-	movl	%edx,4(%esi)
-	jz	L022sw_end
-	# Tail Round 1 
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	decl	%ebx
-	movl	%edx,12(%esi)
-	jz	L022sw_end
-	# Tail Round 2 
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	decl	%ebx
-	movl	%edx,20(%esi)
-	jz	L022sw_end
-	# Tail Round 3 
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	decl	%ebx
-	movl	%edx,28(%esi)
-	jz	L022sw_end
-	# Tail Round 4 
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	decl	%ebx
-	movl	%edx,36(%esi)
-	jz	L022sw_end
-	# Tail Round 5 
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	decl	%ebx
-	movl	%edx,44(%esi)
-	jz	L022sw_end
-	# Tail Round 6 
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-L022sw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -634,8 +182,8 @@
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	L023aw_finish
-L024aw_loop:
+	jz	L006aw_finish
+L007aw_loop:
 	# Round 0 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -713,11 +261,11 @@
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	L024aw_loop
-L023aw_finish:
+	jnz	L007aw_loop
+L006aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 0 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -728,7 +276,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 1 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -739,7 +287,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 2 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -750,7 +298,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 3 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -761,7 +309,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 4 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -772,7 +320,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 5 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -783,7 +331,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	L025aw_end
+	jz	L008aw_end
 	# Tail Round 6 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -793,7 +341,7 @@
 	addl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-L025aw_end:
+L008aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -815,8 +363,8 @@
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	L026aw_finish
-L027aw_loop:
+	jz	L009aw_finish
+L010aw_loop:
 	# Round 0 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -894,11 +442,11 @@
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	L027aw_loop
-L026aw_finish:
+	jnz	L010aw_loop
+L009aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 0 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -909,7 +457,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 1 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -920,7 +468,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 2 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -931,7 +479,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 3 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -942,7 +490,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 4 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -953,7 +501,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 5 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -964,7 +512,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	L028aw_end
+	jz	L011aw_end
 	# Tail Round 6 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -974,14 +522,10 @@
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-L028aw_end:
+L011aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
 	popl	%ebp
 	ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
 #endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/bn-586-linux.S b/gen/bcm/bn-586-linux.S
index 311f22c..fb83b22 100644
--- a/gen/bcm/bn-586-linux.S
+++ b/gen/bcm/bn-586-linux.S
@@ -11,20 +11,14 @@
 .align	16
 bn_mul_add_words:
 .L_bn_mul_add_words_begin:
-	call	.L000PIC_me_up
-.L000PIC_me_up:
-	popl	%eax
-	leal	OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	.L001maw_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 	movd	16(%esp),%mm0
 	pxor	%mm1,%mm1
-	jmp	.L002maw_sse2_entry
+	jmp	.L000maw_sse2_entry
 .align	16
-.L003maw_sse2_unrolled:
+.L001maw_sse2_unrolled:
 	movd	(%eax),%mm3
 	paddq	%mm3,%mm1
 	movd	(%edx),%mm2
@@ -84,12 +78,12 @@
 	leal	32(%eax),%eax
 	psrlq	$32,%mm1
 	subl	$8,%ecx
-	jz	.L004maw_sse2_exit
-.L002maw_sse2_entry:
+	jz	.L002maw_sse2_exit
+.L000maw_sse2_entry:
 	testl	$4294967288,%ecx
-	jnz	.L003maw_sse2_unrolled
+	jnz	.L001maw_sse2_unrolled
 .align	4
-.L005maw_sse2_loop:
+.L003maw_sse2_loop:
 	movd	(%edx),%mm2
 	movd	(%eax),%mm3
 	pmuludq	%mm0,%mm2
@@ -100,189 +94,11 @@
 	subl	$1,%ecx
 	psrlq	$32,%mm1
 	leal	4(%eax),%eax
-	jnz	.L005maw_sse2_loop
-.L004maw_sse2_exit:
+	jnz	.L003maw_sse2_loop
+.L002maw_sse2_exit:
 	movd	%mm1,%eax
 	emms
 	ret
-.align	16
-.L001maw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	28(%esp),%ecx
-	movl	24(%esp),%ebx
-	andl	$4294967288,%ecx
-	movl	32(%esp),%ebp
-	pushl	%ecx
-	jz	.L006maw_finish
-.align	16
-.L007maw_loop:
-
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-
-	movl	28(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	28(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	subl	$8,%ecx
-	leal	32(%ebx),%ebx
-	leal	32(%edi),%edi
-	jnz	.L007maw_loop
-.L006maw_finish:
-	movl	32(%esp),%ecx
-	andl	$7,%ecx
-	jnz	.L008maw_finish2
-	jmp	.L009maw_end
-.L008maw_finish2:
-
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	jz	.L009maw_end
-
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-.L009maw_end:
-	movl	%esi,%eax
-	popl	%ecx
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -295,19 +111,13 @@
 .align	16
 bn_mul_words:
 .L_bn_mul_words_begin:
-	call	.L010PIC_me_up
-.L010PIC_me_up:
-	popl	%eax
-	leal	OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	.L011mw_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 	movd	16(%esp),%mm0
 	pxor	%mm1,%mm1
 .align	16
-.L012mw_sse2_loop:
+.L004mw_sse2_loop:
 	movd	(%edx),%mm2
 	pmuludq	%mm0,%mm2
 	leal	4(%edx),%edx
@@ -316,156 +126,10 @@
 	subl	$1,%ecx
 	psrlq	$32,%mm1
 	leal	4(%eax),%eax
-	jnz	.L012mw_sse2_loop
+	jnz	.L004mw_sse2_loop
 	movd	%mm1,%eax
 	emms
 	ret
-.align	16
-.L011mw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebx
-	movl	28(%esp),%ebp
-	movl	32(%esp),%ecx
-	andl	$4294967288,%ebp
-	jz	.L013mw_finish
-.L014mw_loop:
-
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-
-	movl	28(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	addl	$32,%ebx
-	addl	$32,%edi
-	subl	$8,%ebp
-	jz	.L013mw_finish
-	jmp	.L014mw_loop
-.L013mw_finish:
-	movl	28(%esp),%ebp
-	andl	$7,%ebp
-	jnz	.L015mw_finish2
-	jmp	.L016mw_end
-.L015mw_finish2:
-
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	.L016mw_end
-
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-.L016mw_end:
-	movl	%esi,%eax
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -478,136 +142,20 @@
 .align	16
 bn_sqr_words:
 .L_bn_sqr_words_begin:
-	call	.L017PIC_me_up
-.L017PIC_me_up:
-	popl	%eax
-	leal	OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	.L018sqr_non_sse2
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	12(%esp),%ecx
 .align	16
-.L019sqr_sse2_loop:
+.L005sqr_sse2_loop:
 	movd	(%edx),%mm0
 	pmuludq	%mm0,%mm0
 	leal	4(%edx),%edx
 	movq	%mm0,(%eax)
 	subl	$1,%ecx
 	leal	8(%eax),%eax
-	jnz	.L019sqr_sse2_loop
+	jnz	.L005sqr_sse2_loop
 	emms
 	ret
-.align	16
-.L018sqr_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%ebx
-	andl	$4294967288,%ebx
-	jz	.L020sw_finish
-.L021sw_loop:
-
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	movl	%edx,4(%esi)
-
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	movl	%edx,12(%esi)
-
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	movl	%edx,20(%esi)
-
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	movl	%edx,28(%esi)
-
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	movl	%edx,36(%esi)
-
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	movl	%edx,44(%esi)
-
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-
-	movl	28(%edi),%eax
-	mull	%eax
-	movl	%eax,56(%esi)
-	movl	%edx,60(%esi)
-
-	addl	$32,%edi
-	addl	$64,%esi
-	subl	$8,%ebx
-	jnz	.L021sw_loop
-.L020sw_finish:
-	movl	28(%esp),%ebx
-	andl	$7,%ebx
-	jz	.L022sw_end
-
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	decl	%ebx
-	movl	%edx,4(%esi)
-	jz	.L022sw_end
-
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	decl	%ebx
-	movl	%edx,12(%esi)
-	jz	.L022sw_end
-
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	decl	%ebx
-	movl	%edx,20(%esi)
-	jz	.L022sw_end
-
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	decl	%ebx
-	movl	%edx,28(%esi)
-	jz	.L022sw_end
-
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	decl	%ebx
-	movl	%edx,36(%esi)
-	jz	.L022sw_end
-
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	decl	%ebx
-	movl	%edx,44(%esi)
-	jz	.L022sw_end
-
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-.L022sw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -643,8 +191,8 @@
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L023aw_finish
-.L024aw_loop:
+	jz	.L006aw_finish
+.L007aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -722,11 +270,11 @@
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L024aw_loop
-.L023aw_finish:
+	jnz	.L007aw_loop
+.L006aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -737,7 +285,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -748,7 +296,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -759,7 +307,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -770,7 +318,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -781,7 +329,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -792,7 +340,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L025aw_end
+	jz	.L008aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -802,7 +350,7 @@
 	addl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L025aw_end:
+.L008aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -826,8 +374,8 @@
 	movl	32(%esp),%ebp
 	xorl	%eax,%eax
 	andl	$4294967288,%ebp
-	jz	.L026aw_finish
-.L027aw_loop:
+	jz	.L009aw_finish
+.L010aw_loop:
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -905,11 +453,11 @@
 	addl	$32,%edi
 	addl	$32,%ebx
 	subl	$8,%ebp
-	jnz	.L027aw_loop
-.L026aw_finish:
+	jnz	.L010aw_loop
+.L009aw_finish:
 	movl	32(%esp),%ebp
 	andl	$7,%ebp
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	(%esi),%ecx
 	movl	(%edi),%edx
@@ -920,7 +468,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	4(%esi),%ecx
 	movl	4(%edi),%edx
@@ -931,7 +479,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,4(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	8(%esi),%ecx
 	movl	8(%edi),%edx
@@ -942,7 +490,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,8(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	12(%esi),%ecx
 	movl	12(%edi),%edx
@@ -953,7 +501,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,12(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	16(%esi),%ecx
 	movl	16(%edi),%edx
@@ -964,7 +512,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,16(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	20(%esi),%ecx
 	movl	20(%edi),%edx
@@ -975,7 +523,7 @@
 	adcl	$0,%eax
 	decl	%ebp
 	movl	%ecx,20(%ebx)
-	jz	.L028aw_end
+	jz	.L011aw_end
 
 	movl	24(%esi),%ecx
 	movl	24(%edi),%edx
@@ -985,7 +533,7 @@
 	subl	%edx,%ecx
 	adcl	$0,%eax
 	movl	%ecx,24(%ebx)
-.L028aw_end:
+.L011aw_end:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
diff --git a/gen/bcm/bn-586-win.asm b/gen/bcm/bn-586-win.asm
index f7ddfa8..09aafb8 100644
--- a/gen/bcm/bn-586-win.asm
+++ b/gen/bcm/bn-586-win.asm
@@ -13,22 +13,18 @@
 %else
 section	.text	code
 %endif
-;extern	_OPENSSL_ia32cap_P
 global	_bn_mul_add_words
 align	16
 _bn_mul_add_words:
 L$_bn_mul_add_words_begin:
-	lea	eax,[_OPENSSL_ia32cap_P]
-	bt	DWORD [eax],26
-	jnc	NEAR L$000maw_non_sse2
 	mov	eax,DWORD [4+esp]
 	mov	edx,DWORD [8+esp]
 	mov	ecx,DWORD [12+esp]
 	movd	mm0,DWORD [16+esp]
 	pxor	mm1,mm1
-	jmp	NEAR L$001maw_sse2_entry
+	jmp	NEAR L$000maw_sse2_entry
 align	16
-L$002maw_sse2_unrolled:
+L$001maw_sse2_unrolled:
 	movd	mm3,DWORD [eax]
 	paddq	mm1,mm3
 	movd	mm2,DWORD [edx]
@@ -88,12 +84,12 @@
 	lea	eax,[32+eax]
 	psrlq	mm1,32
 	sub	ecx,8
-	jz	NEAR L$003maw_sse2_exit
-L$001maw_sse2_entry:
+	jz	NEAR L$002maw_sse2_exit
+L$000maw_sse2_entry:
 	test	ecx,4294967288
-	jnz	NEAR L$002maw_sse2_unrolled
+	jnz	NEAR L$001maw_sse2_unrolled
 align	4
-L$004maw_sse2_loop:
+L$003maw_sse2_loop:
 	movd	mm2,DWORD [edx]
 	movd	mm3,DWORD [eax]
 	pmuludq	mm2,mm0
@@ -104,189 +100,11 @@
 	sub	ecx,1
 	psrlq	mm1,32
 	lea	eax,[4+eax]
-	jnz	NEAR L$004maw_sse2_loop
-L$003maw_sse2_exit:
+	jnz	NEAR L$003maw_sse2_loop
+L$002maw_sse2_exit:
 	movd	eax,mm1
 	emms
 	ret
-align	16
-L$000maw_non_sse2:
-	push	ebp
-	push	ebx
-	push	esi
-	push	edi
-	; 
-	xor	esi,esi
-	mov	edi,DWORD [20+esp]
-	mov	ecx,DWORD [28+esp]
-	mov	ebx,DWORD [24+esp]
-	and	ecx,4294967288
-	mov	ebp,DWORD [32+esp]
-	push	ecx
-	jz	NEAR L$005maw_finish
-align	16
-L$006maw_loop:
-	; Round 0
-	mov	eax,DWORD [ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [edi]
-	adc	edx,0
-	mov	DWORD [edi],eax
-	mov	esi,edx
-	; Round 4
-	mov	eax,DWORD [4+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [4+edi]
-	adc	edx,0
-	mov	DWORD [4+edi],eax
-	mov	esi,edx
-	; Round 8
-	mov	eax,DWORD [8+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [8+edi]
-	adc	edx,0
-	mov	DWORD [8+edi],eax
-	mov	esi,edx
-	; Round 12
-	mov	eax,DWORD [12+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [12+edi]
-	adc	edx,0
-	mov	DWORD [12+edi],eax
-	mov	esi,edx
-	; Round 16
-	mov	eax,DWORD [16+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [16+edi]
-	adc	edx,0
-	mov	DWORD [16+edi],eax
-	mov	esi,edx
-	; Round 20
-	mov	eax,DWORD [20+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [20+edi]
-	adc	edx,0
-	mov	DWORD [20+edi],eax
-	mov	esi,edx
-	; Round 24
-	mov	eax,DWORD [24+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [24+edi]
-	adc	edx,0
-	mov	DWORD [24+edi],eax
-	mov	esi,edx
-	; Round 28
-	mov	eax,DWORD [28+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [28+edi]
-	adc	edx,0
-	mov	DWORD [28+edi],eax
-	mov	esi,edx
-	; 
-	sub	ecx,8
-	lea	ebx,[32+ebx]
-	lea	edi,[32+edi]
-	jnz	NEAR L$006maw_loop
-L$005maw_finish:
-	mov	ecx,DWORD [32+esp]
-	and	ecx,7
-	jnz	NEAR L$007maw_finish2
-	jmp	NEAR L$008maw_end
-L$007maw_finish2:
-	; Tail Round 0
-	mov	eax,DWORD [ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 1
-	mov	eax,DWORD [4+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [4+edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [4+edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 2
-	mov	eax,DWORD [8+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [8+edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [8+edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 3
-	mov	eax,DWORD [12+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [12+edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [12+edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 4
-	mov	eax,DWORD [16+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [16+edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [16+edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 5
-	mov	eax,DWORD [20+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [20+edi]
-	adc	edx,0
-	dec	ecx
-	mov	DWORD [20+edi],eax
-	mov	esi,edx
-	jz	NEAR L$008maw_end
-	; Tail Round 6
-	mov	eax,DWORD [24+ebx]
-	mul	ebp
-	add	eax,esi
-	adc	edx,0
-	add	eax,DWORD [24+edi]
-	adc	edx,0
-	mov	DWORD [24+edi],eax
-	mov	esi,edx
-L$008maw_end:
-	mov	eax,esi
-	pop	ecx
 	pop	edi
 	pop	esi
 	pop	ebx
@@ -296,16 +114,13 @@
 align	16
 _bn_mul_words:
 L$_bn_mul_words_begin:
-	lea	eax,[_OPENSSL_ia32cap_P]
-	bt	DWORD [eax],26
-	jnc	NEAR L$009mw_non_sse2
 	mov	eax,DWORD [4+esp]
 	mov	edx,DWORD [8+esp]
 	mov	ecx,DWORD [12+esp]
 	movd	mm0,DWORD [16+esp]
 	pxor	mm1,mm1
 align	16
-L$010mw_sse2_loop:
+L$004mw_sse2_loop:
 	movd	mm2,DWORD [edx]
 	pmuludq	mm2,mm0
 	lea	edx,[4+edx]
@@ -314,156 +129,10 @@
 	sub	ecx,1
 	psrlq	mm1,32
 	lea	eax,[4+eax]
-	jnz	NEAR L$010mw_sse2_loop
+	jnz	NEAR L$004mw_sse2_loop
 	movd	eax,mm1
 	emms
 	ret
-align	16
-L$009mw_non_sse2:
-	push	ebp
-	push	ebx
-	push	esi
-	push	edi
-	; 
-	xor	esi,esi
-	mov	edi,DWORD [20+esp]
-	mov	ebx,DWORD [24+esp]
-	mov	ebp,DWORD [28+esp]
-	mov	ecx,DWORD [32+esp]
-	and	ebp,4294967288
-	jz	NEAR L$011mw_finish
-L$012mw_loop:
-	; Round 0
-	mov	eax,DWORD [ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [edi],eax
-	mov	esi,edx
-	; Round 4
-	mov	eax,DWORD [4+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [4+edi],eax
-	mov	esi,edx
-	; Round 8
-	mov	eax,DWORD [8+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [8+edi],eax
-	mov	esi,edx
-	; Round 12
-	mov	eax,DWORD [12+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [12+edi],eax
-	mov	esi,edx
-	; Round 16
-	mov	eax,DWORD [16+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [16+edi],eax
-	mov	esi,edx
-	; Round 20
-	mov	eax,DWORD [20+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [20+edi],eax
-	mov	esi,edx
-	; Round 24
-	mov	eax,DWORD [24+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [24+edi],eax
-	mov	esi,edx
-	; Round 28
-	mov	eax,DWORD [28+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [28+edi],eax
-	mov	esi,edx
-	; 
-	add	ebx,32
-	add	edi,32
-	sub	ebp,8
-	jz	NEAR L$011mw_finish
-	jmp	NEAR L$012mw_loop
-L$011mw_finish:
-	mov	ebp,DWORD [28+esp]
-	and	ebp,7
-	jnz	NEAR L$013mw_finish2
-	jmp	NEAR L$014mw_end
-L$013mw_finish2:
-	; Tail Round 0
-	mov	eax,DWORD [ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 1
-	mov	eax,DWORD [4+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [4+edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 2
-	mov	eax,DWORD [8+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [8+edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 3
-	mov	eax,DWORD [12+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [12+edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 4
-	mov	eax,DWORD [16+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [16+edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 5
-	mov	eax,DWORD [20+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [20+edi],eax
-	mov	esi,edx
-	dec	ebp
-	jz	NEAR L$014mw_end
-	; Tail Round 6
-	mov	eax,DWORD [24+ebx]
-	mul	ecx
-	add	eax,esi
-	adc	edx,0
-	mov	DWORD [24+edi],eax
-	mov	esi,edx
-L$014mw_end:
-	mov	eax,esi
 	pop	edi
 	pop	esi
 	pop	ebx
@@ -473,133 +142,20 @@
 align	16
 _bn_sqr_words:
 L$_bn_sqr_words_begin:
-	lea	eax,[_OPENSSL_ia32cap_P]
-	bt	DWORD [eax],26
-	jnc	NEAR L$015sqr_non_sse2
 	mov	eax,DWORD [4+esp]
 	mov	edx,DWORD [8+esp]
 	mov	ecx,DWORD [12+esp]
 align	16
-L$016sqr_sse2_loop:
+L$005sqr_sse2_loop:
 	movd	mm0,DWORD [edx]
 	pmuludq	mm0,mm0
 	lea	edx,[4+edx]
 	movq	[eax],mm0
 	sub	ecx,1
 	lea	eax,[8+eax]
-	jnz	NEAR L$016sqr_sse2_loop
+	jnz	NEAR L$005sqr_sse2_loop
 	emms
 	ret
-align	16
-L$015sqr_non_sse2:
-	push	ebp
-	push	ebx
-	push	esi
-	push	edi
-	; 
-	mov	esi,DWORD [20+esp]
-	mov	edi,DWORD [24+esp]
-	mov	ebx,DWORD [28+esp]
-	and	ebx,4294967288
-	jz	NEAR L$017sw_finish
-L$018sw_loop:
-	; Round 0
-	mov	eax,DWORD [edi]
-	mul	eax
-	mov	DWORD [esi],eax
-	mov	DWORD [4+esi],edx
-	; Round 4
-	mov	eax,DWORD [4+edi]
-	mul	eax
-	mov	DWORD [8+esi],eax
-	mov	DWORD [12+esi],edx
-	; Round 8
-	mov	eax,DWORD [8+edi]
-	mul	eax
-	mov	DWORD [16+esi],eax
-	mov	DWORD [20+esi],edx
-	; Round 12
-	mov	eax,DWORD [12+edi]
-	mul	eax
-	mov	DWORD [24+esi],eax
-	mov	DWORD [28+esi],edx
-	; Round 16
-	mov	eax,DWORD [16+edi]
-	mul	eax
-	mov	DWORD [32+esi],eax
-	mov	DWORD [36+esi],edx
-	; Round 20
-	mov	eax,DWORD [20+edi]
-	mul	eax
-	mov	DWORD [40+esi],eax
-	mov	DWORD [44+esi],edx
-	; Round 24
-	mov	eax,DWORD [24+edi]
-	mul	eax
-	mov	DWORD [48+esi],eax
-	mov	DWORD [52+esi],edx
-	; Round 28
-	mov	eax,DWORD [28+edi]
-	mul	eax
-	mov	DWORD [56+esi],eax
-	mov	DWORD [60+esi],edx
-	; 
-	add	edi,32
-	add	esi,64
-	sub	ebx,8
-	jnz	NEAR L$018sw_loop
-L$017sw_finish:
-	mov	ebx,DWORD [28+esp]
-	and	ebx,7
-	jz	NEAR L$019sw_end
-	; Tail Round 0
-	mov	eax,DWORD [edi]
-	mul	eax
-	mov	DWORD [esi],eax
-	dec	ebx
-	mov	DWORD [4+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 1
-	mov	eax,DWORD [4+edi]
-	mul	eax
-	mov	DWORD [8+esi],eax
-	dec	ebx
-	mov	DWORD [12+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 2
-	mov	eax,DWORD [8+edi]
-	mul	eax
-	mov	DWORD [16+esi],eax
-	dec	ebx
-	mov	DWORD [20+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 3
-	mov	eax,DWORD [12+edi]
-	mul	eax
-	mov	DWORD [24+esi],eax
-	dec	ebx
-	mov	DWORD [28+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 4
-	mov	eax,DWORD [16+edi]
-	mul	eax
-	mov	DWORD [32+esi],eax
-	dec	ebx
-	mov	DWORD [36+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 5
-	mov	eax,DWORD [20+edi]
-	mul	eax
-	mov	DWORD [40+esi],eax
-	dec	ebx
-	mov	DWORD [44+esi],edx
-	jz	NEAR L$019sw_end
-	; Tail Round 6
-	mov	eax,DWORD [24+edi]
-	mul	eax
-	mov	DWORD [48+esi],eax
-	mov	DWORD [52+esi],edx
-L$019sw_end:
 	pop	edi
 	pop	esi
 	pop	ebx
@@ -629,8 +185,8 @@
 	mov	ebp,DWORD [32+esp]
 	xor	eax,eax
 	and	ebp,4294967288
-	jz	NEAR L$020aw_finish
-L$021aw_loop:
+	jz	NEAR L$006aw_finish
+L$007aw_loop:
 	; Round 0
 	mov	ecx,DWORD [esi]
 	mov	edx,DWORD [edi]
@@ -708,11 +264,11 @@
 	add	edi,32
 	add	ebx,32
 	sub	ebp,8
-	jnz	NEAR L$021aw_loop
-L$020aw_finish:
+	jnz	NEAR L$007aw_loop
+L$006aw_finish:
 	mov	ebp,DWORD [32+esp]
 	and	ebp,7
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 0
 	mov	ecx,DWORD [esi]
 	mov	edx,DWORD [edi]
@@ -723,7 +279,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 1
 	mov	ecx,DWORD [4+esi]
 	mov	edx,DWORD [4+edi]
@@ -734,7 +290,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [4+ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 2
 	mov	ecx,DWORD [8+esi]
 	mov	edx,DWORD [8+edi]
@@ -745,7 +301,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [8+ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 3
 	mov	ecx,DWORD [12+esi]
 	mov	edx,DWORD [12+edi]
@@ -756,7 +312,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [12+ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 4
 	mov	ecx,DWORD [16+esi]
 	mov	edx,DWORD [16+edi]
@@ -767,7 +323,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [16+ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 5
 	mov	ecx,DWORD [20+esi]
 	mov	edx,DWORD [20+edi]
@@ -778,7 +334,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [20+ebx],ecx
-	jz	NEAR L$022aw_end
+	jz	NEAR L$008aw_end
 	; Tail Round 6
 	mov	ecx,DWORD [24+esi]
 	mov	edx,DWORD [24+edi]
@@ -788,7 +344,7 @@
 	add	ecx,edx
 	adc	eax,0
 	mov	DWORD [24+ebx],ecx
-L$022aw_end:
+L$008aw_end:
 	pop	edi
 	pop	esi
 	pop	ebx
@@ -809,8 +365,8 @@
 	mov	ebp,DWORD [32+esp]
 	xor	eax,eax
 	and	ebp,4294967288
-	jz	NEAR L$023aw_finish
-L$024aw_loop:
+	jz	NEAR L$009aw_finish
+L$010aw_loop:
 	; Round 0
 	mov	ecx,DWORD [esi]
 	mov	edx,DWORD [edi]
@@ -888,11 +444,11 @@
 	add	edi,32
 	add	ebx,32
 	sub	ebp,8
-	jnz	NEAR L$024aw_loop
-L$023aw_finish:
+	jnz	NEAR L$010aw_loop
+L$009aw_finish:
 	mov	ebp,DWORD [32+esp]
 	and	ebp,7
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 0
 	mov	ecx,DWORD [esi]
 	mov	edx,DWORD [edi]
@@ -903,7 +459,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 1
 	mov	ecx,DWORD [4+esi]
 	mov	edx,DWORD [4+edi]
@@ -914,7 +470,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [4+ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 2
 	mov	ecx,DWORD [8+esi]
 	mov	edx,DWORD [8+edi]
@@ -925,7 +481,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [8+ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 3
 	mov	ecx,DWORD [12+esi]
 	mov	edx,DWORD [12+edi]
@@ -936,7 +492,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [12+ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 4
 	mov	ecx,DWORD [16+esi]
 	mov	edx,DWORD [16+edi]
@@ -947,7 +503,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [16+ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 5
 	mov	ecx,DWORD [20+esi]
 	mov	edx,DWORD [20+edi]
@@ -958,7 +514,7 @@
 	adc	eax,0
 	dec	ebp
 	mov	DWORD [20+ebx],ecx
-	jz	NEAR L$025aw_end
+	jz	NEAR L$011aw_end
 	; Tail Round 6
 	mov	ecx,DWORD [24+esi]
 	mov	edx,DWORD [24+edi]
@@ -968,14 +524,12 @@
 	sub	ecx,edx
 	adc	eax,0
 	mov	DWORD [24+ebx],ecx
-L$025aw_end:
+L$011aw_end:
 	pop	edi
 	pop	esi
 	pop	ebx
 	pop	ebp
 	ret
-segment	.bss
-common	_OPENSSL_ia32cap_P 16
 %else
 ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
 ret
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S
index cfdeac1..d4d05cb 100644
--- a/gen/bcm/sha512-586-apple.S
+++ b/gen/bcm/sha512-586-apple.S
@@ -32,8 +32,6 @@
 	movl	%ebx,12(%esp)
 	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
 	movl	(%edx),%ecx
-	testl	$67108864,%ecx
-	jz	L002loop_x86
 	movl	4(%edx),%edx
 	movq	(%esi),%mm0
 	andl	$16777216,%ecx
@@ -47,11 +45,11 @@
 	movq	48(%esi),%mm6
 	movq	56(%esi),%mm7
 	cmpl	$16777728,%ecx
-	je	L003SSSE3
+	je	L002SSSE3
 	subl	$80,%esp
-	jmp	L004loop_sse2
+	jmp	L003loop_sse2
 .align	4,0x90
-L004loop_sse2:
+L003loop_sse2:
 	movq	%mm1,8(%esp)
 	movq	%mm2,16(%esp)
 	movq	%mm3,24(%esp)
@@ -66,9 +64,9 @@
 	movl	$15,%edx
 	bswap	%eax
 	bswap	%ebx
-	jmp	L00500_14_sse2
+	jmp	L00400_14_sse2
 .align	4,0x90
-L00500_14_sse2:
+L00400_14_sse2:
 	movd	%eax,%mm1
 	movl	(%edi),%eax
 	movd	%ebx,%mm7
@@ -129,7 +127,7 @@
 	paddq	%mm6,%mm3
 	movq	48(%esp),%mm6
 	decl	%edx
-	jnz	L00500_14_sse2
+	jnz	L00400_14_sse2
 	movd	%eax,%mm1
 	movd	%ebx,%mm7
 	punpckldq	%mm1,%mm7
@@ -185,9 +183,9 @@
 	paddq	%mm6,%mm3
 	pxor	%mm0,%mm0
 	movl	$32,%edx
-	jmp	L00616_79_sse2
+	jmp	L00516_79_sse2
 .align	4,0x90
-L00616_79_sse2:
+L00516_79_sse2:
 	movq	88(%esp),%mm5
 	movq	%mm7,%mm1
 	psrlq	$1,%mm7
@@ -341,7 +339,7 @@
 	paddq	%mm6,%mm0
 	addl	$8,%ebp
 	decl	%edx
-	jnz	L00616_79_sse2
+	jnz	L00516_79_sse2
 	paddq	%mm3,%mm0
 	movq	8(%esp),%mm1
 	movq	24(%esp),%mm3
@@ -369,7 +367,7 @@
 	leal	(%esp,%eax,1),%esp
 	subl	%eax,%ebp
 	cmpl	88(%esp),%edi
-	jb	L004loop_sse2
+	jb	L003loop_sse2
 	movl	92(%esp),%esp
 	emms
 	popl	%edi
@@ -378,7 +376,7 @@
 	popl	%ebp
 	ret
 .align	5,0x90
-L003SSSE3:
+L002SSSE3:
 	leal	-64(%esp),%edx
 	subl	$256,%esp
 	movdqa	640(%ebp),%xmm1
@@ -435,7 +433,7 @@
 	movdqa	%xmm2,-16(%edx)
 	nop
 .align	5,0x90
-L007loop_ssse3:
+L006loop_ssse3:
 	movdqa	16(%edx),%xmm2
 	movdqa	%xmm3,48(%edx)
 	leal	128(%ebp),%ebp
@@ -452,9 +450,9 @@
 	pxor	%mm1,%mm2
 	movq	%mm7,56(%esp)
 	pxor	%mm3,%mm3
-	jmp	L00800_47_ssse3
+	jmp	L00700_47_ssse3
 .align	5,0x90
-L00800_47_ssse3:
+L00700_47_ssse3:
 	movdqa	%xmm5,%xmm3
 	movdqa	%xmm2,%xmm1
 .byte	102,15,58,15,208,8
@@ -1473,7 +1471,7 @@
 	movdqa	%xmm1,-16(%edx)
 	leal	128(%ebp),%ebp
 	decl	%ecx
-	jnz	L00800_47_ssse3
+	jnz	L00700_47_ssse3
 	movdqa	(%ebp),%xmm1
 	leal	-640(%ebp),%ebp
 	movdqu	(%ebx),%xmm0
@@ -2285,7 +2283,7 @@
 	movq	%mm6,48(%esi)
 	movq	%mm7,56(%esi)
 	cmpl	%eax,%edi
-	jb	L007loop_ssse3
+	jb	L006loop_ssse3
 	movl	76(%edx),%esp
 	emms
 	popl	%edi
@@ -2293,454 +2291,6 @@
 	popl	%ebx
 	popl	%ebp
 	ret
-.align	4,0x90
-L002loop_x86:
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	16(%edi),%eax
-	movl	20(%edi),%ebx
-	movl	24(%edi),%ecx
-	movl	28(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	32(%edi),%eax
-	movl	36(%edi),%ebx
-	movl	40(%edi),%ecx
-	movl	44(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	48(%edi),%eax
-	movl	52(%edi),%ebx
-	movl	56(%edi),%ecx
-	movl	60(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	64(%edi),%eax
-	movl	68(%edi),%ebx
-	movl	72(%edi),%ecx
-	movl	76(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	80(%edi),%eax
-	movl	84(%edi),%ebx
-	movl	88(%edi),%ecx
-	movl	92(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	96(%edi),%eax
-	movl	100(%edi),%ebx
-	movl	104(%edi),%ecx
-	movl	108(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	112(%edi),%eax
-	movl	116(%edi),%ebx
-	movl	120(%edi),%ecx
-	movl	124(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	addl	$128,%edi
-	subl	$72,%esp
-	movl	%edi,204(%esp)
-	leal	8(%esp),%edi
-	movl	$16,%ecx
-.long	2784229001
-.align	4,0x90
-L00900_15_x86:
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$148,%dl
-	jne	L00900_15_x86
-.align	4,0x90
-L01016_79_x86:
-	movl	312(%esp),%ecx
-	movl	316(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$1,%ecx
-	movl	%edx,%edi
-	shrl	$1,%edx
-	movl	%ecx,%eax
-	shll	$24,%esi
-	movl	%edx,%ebx
-	shll	$24,%edi
-	xorl	%esi,%ebx
-	shrl	$6,%ecx
-	xorl	%edi,%eax
-	shrl	$6,%edx
-	xorl	%ecx,%eax
-	shll	$7,%esi
-	xorl	%edx,%ebx
-	shll	$1,%edi
-	xorl	%esi,%ebx
-	shrl	$1,%ecx
-	xorl	%edi,%eax
-	shrl	$1,%edx
-	xorl	%ecx,%eax
-	shll	$6,%edi
-	xorl	%edx,%ebx
-	xorl	%edi,%eax
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movl	208(%esp),%ecx
-	movl	212(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$6,%ecx
-	movl	%edx,%edi
-	shrl	$6,%edx
-	movl	%ecx,%eax
-	shll	$3,%esi
-	movl	%edx,%ebx
-	shll	$3,%edi
-	xorl	%esi,%eax
-	shrl	$13,%ecx
-	xorl	%edi,%ebx
-	shrl	$13,%edx
-	xorl	%ecx,%eax
-	shll	$10,%esi
-	xorl	%edx,%ebx
-	shll	$10,%edi
-	xorl	%esi,%ebx
-	shrl	$10,%ecx
-	xorl	%edi,%eax
-	shrl	$10,%edx
-	xorl	%ecx,%ebx
-	shll	$13,%edi
-	xorl	%edx,%eax
-	xorl	%edi,%eax
-	movl	320(%esp),%ecx
-	movl	324(%esp),%edx
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	movl	248(%esp),%esi
-	movl	252(%esp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,192(%esp)
-	movl	%ebx,196(%esp)
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$23,%dl
-	jne	L01016_79_x86
-	movl	840(%esp),%esi
-	movl	844(%esp),%edi
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
-	addl	8(%esp),%eax
-	adcl	12(%esp),%ebx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	addl	16(%esp),%ecx
-	adcl	20(%esp),%edx
-	movl	%ecx,8(%esi)
-	movl	%edx,12(%esi)
-	movl	16(%esi),%eax
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edx
-	addl	24(%esp),%eax
-	adcl	28(%esp),%ebx
-	movl	%eax,16(%esi)
-	movl	%ebx,20(%esi)
-	addl	32(%esp),%ecx
-	adcl	36(%esp),%edx
-	movl	%ecx,24(%esi)
-	movl	%edx,28(%esi)
-	movl	32(%esi),%eax
-	movl	36(%esi),%ebx
-	movl	40(%esi),%ecx
-	movl	44(%esi),%edx
-	addl	40(%esp),%eax
-	adcl	44(%esp),%ebx
-	movl	%eax,32(%esi)
-	movl	%ebx,36(%esi)
-	addl	48(%esp),%ecx
-	adcl	52(%esp),%edx
-	movl	%ecx,40(%esi)
-	movl	%edx,44(%esi)
-	movl	48(%esi),%eax
-	movl	52(%esi),%ebx
-	movl	56(%esi),%ecx
-	movl	60(%esi),%edx
-	addl	56(%esp),%eax
-	adcl	60(%esp),%ebx
-	movl	%eax,48(%esi)
-	movl	%ebx,52(%esi)
-	addl	64(%esp),%ecx
-	adcl	68(%esp),%edx
-	movl	%ecx,56(%esi)
-	movl	%edx,60(%esi)
-	addl	$840,%esp
-	subl	$640,%ebp
-	cmpl	8(%esp),%edi
-	jb	L002loop_x86
-	movl	12(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
 .align	6,0x90
 L001K512:
 .long	3609767458,1116352408
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S
index bb2884d..3dc0ecb 100644
--- a/gen/bcm/sha512-586-linux.S
+++ b/gen/bcm/sha512-586-linux.S
@@ -33,8 +33,6 @@
 	movl	%ebx,12(%esp)
 	leal	OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
 	movl	(%edx),%ecx
-	testl	$67108864,%ecx
-	jz	.L002loop_x86
 	movl	4(%edx),%edx
 	movq	(%esi),%mm0
 	andl	$16777216,%ecx
@@ -48,11 +46,11 @@
 	movq	48(%esi),%mm6
 	movq	56(%esi),%mm7
 	cmpl	$16777728,%ecx
-	je	.L003SSSE3
+	je	.L002SSSE3
 	subl	$80,%esp
-	jmp	.L004loop_sse2
+	jmp	.L003loop_sse2
 .align	16
-.L004loop_sse2:
+.L003loop_sse2:
 	movq	%mm1,8(%esp)
 	movq	%mm2,16(%esp)
 	movq	%mm3,24(%esp)
@@ -67,9 +65,9 @@
 	movl	$15,%edx
 	bswap	%eax
 	bswap	%ebx
-	jmp	.L00500_14_sse2
+	jmp	.L00400_14_sse2
 .align	16
-.L00500_14_sse2:
+.L00400_14_sse2:
 	movd	%eax,%mm1
 	movl	(%edi),%eax
 	movd	%ebx,%mm7
@@ -130,7 +128,7 @@
 	paddq	%mm6,%mm3
 	movq	48(%esp),%mm6
 	decl	%edx
-	jnz	.L00500_14_sse2
+	jnz	.L00400_14_sse2
 	movd	%eax,%mm1
 	movd	%ebx,%mm7
 	punpckldq	%mm1,%mm7
@@ -186,9 +184,9 @@
 	paddq	%mm6,%mm3
 	pxor	%mm0,%mm0
 	movl	$32,%edx
-	jmp	.L00616_79_sse2
+	jmp	.L00516_79_sse2
 .align	16
-.L00616_79_sse2:
+.L00516_79_sse2:
 	movq	88(%esp),%mm5
 	movq	%mm7,%mm1
 	psrlq	$1,%mm7
@@ -342,7 +340,7 @@
 	paddq	%mm6,%mm0
 	addl	$8,%ebp
 	decl	%edx
-	jnz	.L00616_79_sse2
+	jnz	.L00516_79_sse2
 	paddq	%mm3,%mm0
 	movq	8(%esp),%mm1
 	movq	24(%esp),%mm3
@@ -370,7 +368,7 @@
 	leal	(%esp,%eax,1),%esp
 	subl	%eax,%ebp
 	cmpl	88(%esp),%edi
-	jb	.L004loop_sse2
+	jb	.L003loop_sse2
 	movl	92(%esp),%esp
 	emms
 	popl	%edi
@@ -379,7 +377,7 @@
 	popl	%ebp
 	ret
 .align	32
-.L003SSSE3:
+.L002SSSE3:
 	leal	-64(%esp),%edx
 	subl	$256,%esp
 	movdqa	640(%ebp),%xmm1
@@ -436,7 +434,7 @@
 	movdqa	%xmm2,-16(%edx)
 	nop
 .align	32
-.L007loop_ssse3:
+.L006loop_ssse3:
 	movdqa	16(%edx),%xmm2
 	movdqa	%xmm3,48(%edx)
 	leal	128(%ebp),%ebp
@@ -453,9 +451,9 @@
 	pxor	%mm1,%mm2
 	movq	%mm7,56(%esp)
 	pxor	%mm3,%mm3
-	jmp	.L00800_47_ssse3
+	jmp	.L00700_47_ssse3
 .align	32
-.L00800_47_ssse3:
+.L00700_47_ssse3:
 	movdqa	%xmm5,%xmm3
 	movdqa	%xmm2,%xmm1
 .byte	102,15,58,15,208,8
@@ -1474,7 +1472,7 @@
 	movdqa	%xmm1,-16(%edx)
 	leal	128(%ebp),%ebp
 	decl	%ecx
-	jnz	.L00800_47_ssse3
+	jnz	.L00700_47_ssse3
 	movdqa	(%ebp),%xmm1
 	leal	-640(%ebp),%ebp
 	movdqu	(%ebx),%xmm0
@@ -2286,7 +2284,7 @@
 	movq	%mm6,48(%esi)
 	movq	%mm7,56(%esi)
 	cmpl	%eax,%edi
-	jb	.L007loop_ssse3
+	jb	.L006loop_ssse3
 	movl	76(%edx),%esp
 	emms
 	popl	%edi
@@ -2294,454 +2292,6 @@
 	popl	%ebx
 	popl	%ebp
 	ret
-.align	16
-.L002loop_x86:
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	16(%edi),%eax
-	movl	20(%edi),%ebx
-	movl	24(%edi),%ecx
-	movl	28(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	32(%edi),%eax
-	movl	36(%edi),%ebx
-	movl	40(%edi),%ecx
-	movl	44(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	48(%edi),%eax
-	movl	52(%edi),%ebx
-	movl	56(%edi),%ecx
-	movl	60(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	64(%edi),%eax
-	movl	68(%edi),%ebx
-	movl	72(%edi),%ecx
-	movl	76(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	80(%edi),%eax
-	movl	84(%edi),%ebx
-	movl	88(%edi),%ecx
-	movl	92(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	96(%edi),%eax
-	movl	100(%edi),%ebx
-	movl	104(%edi),%ecx
-	movl	108(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	112(%edi),%eax
-	movl	116(%edi),%ebx
-	movl	120(%edi),%ecx
-	movl	124(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	addl	$128,%edi
-	subl	$72,%esp
-	movl	%edi,204(%esp)
-	leal	8(%esp),%edi
-	movl	$16,%ecx
-.long	2784229001
-.align	16
-.L00900_15_x86:
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$148,%dl
-	jne	.L00900_15_x86
-.align	16
-.L01016_79_x86:
-	movl	312(%esp),%ecx
-	movl	316(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$1,%ecx
-	movl	%edx,%edi
-	shrl	$1,%edx
-	movl	%ecx,%eax
-	shll	$24,%esi
-	movl	%edx,%ebx
-	shll	$24,%edi
-	xorl	%esi,%ebx
-	shrl	$6,%ecx
-	xorl	%edi,%eax
-	shrl	$6,%edx
-	xorl	%ecx,%eax
-	shll	$7,%esi
-	xorl	%edx,%ebx
-	shll	$1,%edi
-	xorl	%esi,%ebx
-	shrl	$1,%ecx
-	xorl	%edi,%eax
-	shrl	$1,%edx
-	xorl	%ecx,%eax
-	shll	$6,%edi
-	xorl	%edx,%ebx
-	xorl	%edi,%eax
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movl	208(%esp),%ecx
-	movl	212(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$6,%ecx
-	movl	%edx,%edi
-	shrl	$6,%edx
-	movl	%ecx,%eax
-	shll	$3,%esi
-	movl	%edx,%ebx
-	shll	$3,%edi
-	xorl	%esi,%eax
-	shrl	$13,%ecx
-	xorl	%edi,%ebx
-	shrl	$13,%edx
-	xorl	%ecx,%eax
-	shll	$10,%esi
-	xorl	%edx,%ebx
-	shll	$10,%edi
-	xorl	%esi,%ebx
-	shrl	$10,%ecx
-	xorl	%edi,%eax
-	shrl	$10,%edx
-	xorl	%ecx,%ebx
-	shll	$13,%edi
-	xorl	%edx,%eax
-	xorl	%edi,%eax
-	movl	320(%esp),%ecx
-	movl	324(%esp),%edx
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	movl	248(%esp),%esi
-	movl	252(%esp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,192(%esp)
-	movl	%ebx,196(%esp)
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$23,%dl
-	jne	.L01016_79_x86
-	movl	840(%esp),%esi
-	movl	844(%esp),%edi
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
-	addl	8(%esp),%eax
-	adcl	12(%esp),%ebx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	addl	16(%esp),%ecx
-	adcl	20(%esp),%edx
-	movl	%ecx,8(%esi)
-	movl	%edx,12(%esi)
-	movl	16(%esi),%eax
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edx
-	addl	24(%esp),%eax
-	adcl	28(%esp),%ebx
-	movl	%eax,16(%esi)
-	movl	%ebx,20(%esi)
-	addl	32(%esp),%ecx
-	adcl	36(%esp),%edx
-	movl	%ecx,24(%esi)
-	movl	%edx,28(%esi)
-	movl	32(%esi),%eax
-	movl	36(%esi),%ebx
-	movl	40(%esi),%ecx
-	movl	44(%esi),%edx
-	addl	40(%esp),%eax
-	adcl	44(%esp),%ebx
-	movl	%eax,32(%esi)
-	movl	%ebx,36(%esi)
-	addl	48(%esp),%ecx
-	adcl	52(%esp),%edx
-	movl	%ecx,40(%esi)
-	movl	%edx,44(%esi)
-	movl	48(%esi),%eax
-	movl	52(%esi),%ebx
-	movl	56(%esi),%ecx
-	movl	60(%esi),%edx
-	addl	56(%esp),%eax
-	adcl	60(%esp),%ebx
-	movl	%eax,48(%esi)
-	movl	%ebx,52(%esi)
-	addl	64(%esp),%ecx
-	adcl	68(%esp),%edx
-	movl	%ecx,56(%esi)
-	movl	%edx,60(%esi)
-	addl	$840,%esp
-	subl	$640,%ebp
-	cmpl	8(%esp),%edi
-	jb	.L002loop_x86
-	movl	12(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
 .align	64
 .L001K512:
 .long	3609767458,1116352408
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm
index 3603a6d..ba55f7d 100644
--- a/gen/bcm/sha512-586-win.asm
+++ b/gen/bcm/sha512-586-win.asm
@@ -40,8 +40,6 @@
 	mov	DWORD [12+esp],ebx
 	lea	edx,[_OPENSSL_ia32cap_P]
 	mov	ecx,DWORD [edx]
-	test	ecx,67108864
-	jz	NEAR L$002loop_x86
 	mov	edx,DWORD [4+edx]
 	movq	mm0,[esi]
 	and	ecx,16777216
@@ -55,11 +53,11 @@
 	movq	mm6,[48+esi]
 	movq	mm7,[56+esi]
 	cmp	ecx,16777728
-	je	NEAR L$003SSSE3
+	je	NEAR L$002SSSE3
 	sub	esp,80
-	jmp	NEAR L$004loop_sse2
+	jmp	NEAR L$003loop_sse2
 align	16
-L$004loop_sse2:
+L$003loop_sse2:
 	movq	[8+esp],mm1
 	movq	[16+esp],mm2
 	movq	[24+esp],mm3
@@ -74,9 +72,9 @@
 	mov	edx,15
 	bswap	eax
 	bswap	ebx
-	jmp	NEAR L$00500_14_sse2
+	jmp	NEAR L$00400_14_sse2
 align	16
-L$00500_14_sse2:
+L$00400_14_sse2:
 	movd	mm1,eax
 	mov	eax,DWORD [edi]
 	movd	mm7,ebx
@@ -137,7 +135,7 @@
 	paddq	mm3,mm6
 	movq	mm6,[48+esp]
 	dec	edx
-	jnz	NEAR L$00500_14_sse2
+	jnz	NEAR L$00400_14_sse2
 	movd	mm1,eax
 	movd	mm7,ebx
 	punpckldq	mm7,mm1
@@ -193,9 +191,9 @@
 	paddq	mm3,mm6
 	pxor	mm0,mm0
 	mov	edx,32
-	jmp	NEAR L$00616_79_sse2
+	jmp	NEAR L$00516_79_sse2
 align	16
-L$00616_79_sse2:
+L$00516_79_sse2:
 	movq	mm5,[88+esp]
 	movq	mm1,mm7
 	psrlq	mm7,1
@@ -349,7 +347,7 @@
 	paddq	mm0,mm6
 	add	ebp,8
 	dec	edx
-	jnz	NEAR L$00616_79_sse2
+	jnz	NEAR L$00516_79_sse2
 	paddq	mm0,mm3
 	movq	mm1,[8+esp]
 	movq	mm3,[24+esp]
@@ -377,7 +375,7 @@
 	lea	esp,[eax*1+esp]
 	sub	ebp,eax
 	cmp	edi,DWORD [88+esp]
-	jb	NEAR L$004loop_sse2
+	jb	NEAR L$003loop_sse2
 	mov	esp,DWORD [92+esp]
 	emms
 	pop	edi
@@ -386,7 +384,7 @@
 	pop	ebp
 	ret
 align	32
-L$003SSSE3:
+L$002SSSE3:
 	lea	edx,[esp-64]
 	sub	esp,256
 	movdqa	xmm1,[640+ebp]
@@ -443,7 +441,7 @@
 	movdqa	[edx-16],xmm2
 	nop
 align	32
-L$007loop_ssse3:
+L$006loop_ssse3:
 	movdqa	xmm2,[16+edx]
 	movdqa	[48+edx],xmm3
 	lea	ebp,[128+ebp]
@@ -460,9 +458,9 @@
 	pxor	mm2,mm1
 	movq	[56+esp],mm7
 	pxor	mm3,mm3
-	jmp	NEAR L$00800_47_ssse3
+	jmp	NEAR L$00700_47_ssse3
 align	32
-L$00800_47_ssse3:
+L$00700_47_ssse3:
 	movdqa	xmm3,xmm5
 	movdqa	xmm1,xmm2
 db	102,15,58,15,208,8
@@ -1481,7 +1479,7 @@
 	movdqa	[edx-16],xmm1
 	lea	ebp,[128+ebp]
 	dec	ecx
-	jnz	NEAR L$00800_47_ssse3
+	jnz	NEAR L$00700_47_ssse3
 	movdqa	xmm1,[ebp]
 	lea	ebp,[ebp-640]
 	movdqu	xmm0,[ebx]
@@ -2293,7 +2291,7 @@
 	movq	[48+esi],mm6
 	movq	[56+esi],mm7
 	cmp	edi,eax
-	jb	NEAR L$007loop_ssse3
+	jb	NEAR L$006loop_ssse3
 	mov	esp,DWORD [76+edx]
 	emms
 	pop	edi
@@ -2301,454 +2299,6 @@
 	pop	ebx
 	pop	ebp
 	ret
-align	16
-L$002loop_x86:
-	mov	eax,DWORD [edi]
-	mov	ebx,DWORD [4+edi]
-	mov	ecx,DWORD [8+edi]
-	mov	edx,DWORD [12+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [16+edi]
-	mov	ebx,DWORD [20+edi]
-	mov	ecx,DWORD [24+edi]
-	mov	edx,DWORD [28+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [32+edi]
-	mov	ebx,DWORD [36+edi]
-	mov	ecx,DWORD [40+edi]
-	mov	edx,DWORD [44+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [48+edi]
-	mov	ebx,DWORD [52+edi]
-	mov	ecx,DWORD [56+edi]
-	mov	edx,DWORD [60+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [64+edi]
-	mov	ebx,DWORD [68+edi]
-	mov	ecx,DWORD [72+edi]
-	mov	edx,DWORD [76+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [80+edi]
-	mov	ebx,DWORD [84+edi]
-	mov	ecx,DWORD [88+edi]
-	mov	edx,DWORD [92+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [96+edi]
-	mov	ebx,DWORD [100+edi]
-	mov	ecx,DWORD [104+edi]
-	mov	edx,DWORD [108+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	mov	eax,DWORD [112+edi]
-	mov	ebx,DWORD [116+edi]
-	mov	ecx,DWORD [120+edi]
-	mov	edx,DWORD [124+edi]
-	bswap	eax
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	push	eax
-	push	ebx
-	push	ecx
-	push	edx
-	add	edi,128
-	sub	esp,72
-	mov	DWORD [204+esp],edi
-	lea	edi,[8+esp]
-	mov	ecx,16
-dd	2784229001
-align	16
-L$00900_15_x86:
-	mov	ecx,DWORD [40+esp]
-	mov	edx,DWORD [44+esp]
-	mov	esi,ecx
-	shr	ecx,9
-	mov	edi,edx
-	shr	edx,9
-	mov	ebx,ecx
-	shl	esi,14
-	mov	eax,edx
-	shl	edi,14
-	xor	ebx,esi
-	shr	ecx,5
-	xor	eax,edi
-	shr	edx,5
-	xor	eax,ecx
-	shl	esi,4
-	xor	ebx,edx
-	shl	edi,4
-	xor	ebx,esi
-	shr	ecx,4
-	xor	eax,edi
-	shr	edx,4
-	xor	eax,ecx
-	shl	esi,5
-	xor	ebx,edx
-	shl	edi,5
-	xor	eax,esi
-	xor	ebx,edi
-	mov	ecx,DWORD [48+esp]
-	mov	edx,DWORD [52+esp]
-	mov	esi,DWORD [56+esp]
-	mov	edi,DWORD [60+esp]
-	add	eax,DWORD [64+esp]
-	adc	ebx,DWORD [68+esp]
-	xor	ecx,esi
-	xor	edx,edi
-	and	ecx,DWORD [40+esp]
-	and	edx,DWORD [44+esp]
-	add	eax,DWORD [192+esp]
-	adc	ebx,DWORD [196+esp]
-	xor	ecx,esi
-	xor	edx,edi
-	mov	esi,DWORD [ebp]
-	mov	edi,DWORD [4+ebp]
-	add	eax,ecx
-	adc	ebx,edx
-	mov	ecx,DWORD [32+esp]
-	mov	edx,DWORD [36+esp]
-	add	eax,esi
-	adc	ebx,edi
-	mov	DWORD [esp],eax
-	mov	DWORD [4+esp],ebx
-	add	eax,ecx
-	adc	ebx,edx
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	mov	DWORD [32+esp],eax
-	mov	DWORD [36+esp],ebx
-	mov	esi,ecx
-	shr	ecx,2
-	mov	edi,edx
-	shr	edx,2
-	mov	ebx,ecx
-	shl	esi,4
-	mov	eax,edx
-	shl	edi,4
-	xor	ebx,esi
-	shr	ecx,5
-	xor	eax,edi
-	shr	edx,5
-	xor	ebx,ecx
-	shl	esi,21
-	xor	eax,edx
-	shl	edi,21
-	xor	eax,esi
-	shr	ecx,21
-	xor	ebx,edi
-	shr	edx,21
-	xor	eax,ecx
-	shl	esi,5
-	xor	ebx,edx
-	shl	edi,5
-	xor	eax,esi
-	xor	ebx,edi
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	mov	esi,DWORD [16+esp]
-	mov	edi,DWORD [20+esp]
-	add	eax,DWORD [esp]
-	adc	ebx,DWORD [4+esp]
-	or	ecx,esi
-	or	edx,edi
-	and	ecx,DWORD [24+esp]
-	and	edx,DWORD [28+esp]
-	and	esi,DWORD [8+esp]
-	and	edi,DWORD [12+esp]
-	or	ecx,esi
-	or	edx,edi
-	add	eax,ecx
-	adc	ebx,edx
-	mov	DWORD [esp],eax
-	mov	DWORD [4+esp],ebx
-	mov	dl,BYTE [ebp]
-	sub	esp,8
-	lea	ebp,[8+ebp]
-	cmp	dl,148
-	jne	NEAR L$00900_15_x86
-align	16
-L$01016_79_x86:
-	mov	ecx,DWORD [312+esp]
-	mov	edx,DWORD [316+esp]
-	mov	esi,ecx
-	shr	ecx,1
-	mov	edi,edx
-	shr	edx,1
-	mov	eax,ecx
-	shl	esi,24
-	mov	ebx,edx
-	shl	edi,24
-	xor	ebx,esi
-	shr	ecx,6
-	xor	eax,edi
-	shr	edx,6
-	xor	eax,ecx
-	shl	esi,7
-	xor	ebx,edx
-	shl	edi,1
-	xor	ebx,esi
-	shr	ecx,1
-	xor	eax,edi
-	shr	edx,1
-	xor	eax,ecx
-	shl	edi,6
-	xor	ebx,edx
-	xor	eax,edi
-	mov	DWORD [esp],eax
-	mov	DWORD [4+esp],ebx
-	mov	ecx,DWORD [208+esp]
-	mov	edx,DWORD [212+esp]
-	mov	esi,ecx
-	shr	ecx,6
-	mov	edi,edx
-	shr	edx,6
-	mov	eax,ecx
-	shl	esi,3
-	mov	ebx,edx
-	shl	edi,3
-	xor	eax,esi
-	shr	ecx,13
-	xor	ebx,edi
-	shr	edx,13
-	xor	eax,ecx
-	shl	esi,10
-	xor	ebx,edx
-	shl	edi,10
-	xor	ebx,esi
-	shr	ecx,10
-	xor	eax,edi
-	shr	edx,10
-	xor	ebx,ecx
-	shl	edi,13
-	xor	eax,edx
-	xor	eax,edi
-	mov	ecx,DWORD [320+esp]
-	mov	edx,DWORD [324+esp]
-	add	eax,DWORD [esp]
-	adc	ebx,DWORD [4+esp]
-	mov	esi,DWORD [248+esp]
-	mov	edi,DWORD [252+esp]
-	add	eax,ecx
-	adc	ebx,edx
-	add	eax,esi
-	adc	ebx,edi
-	mov	DWORD [192+esp],eax
-	mov	DWORD [196+esp],ebx
-	mov	ecx,DWORD [40+esp]
-	mov	edx,DWORD [44+esp]
-	mov	esi,ecx
-	shr	ecx,9
-	mov	edi,edx
-	shr	edx,9
-	mov	ebx,ecx
-	shl	esi,14
-	mov	eax,edx
-	shl	edi,14
-	xor	ebx,esi
-	shr	ecx,5
-	xor	eax,edi
-	shr	edx,5
-	xor	eax,ecx
-	shl	esi,4
-	xor	ebx,edx
-	shl	edi,4
-	xor	ebx,esi
-	shr	ecx,4
-	xor	eax,edi
-	shr	edx,4
-	xor	eax,ecx
-	shl	esi,5
-	xor	ebx,edx
-	shl	edi,5
-	xor	eax,esi
-	xor	ebx,edi
-	mov	ecx,DWORD [48+esp]
-	mov	edx,DWORD [52+esp]
-	mov	esi,DWORD [56+esp]
-	mov	edi,DWORD [60+esp]
-	add	eax,DWORD [64+esp]
-	adc	ebx,DWORD [68+esp]
-	xor	ecx,esi
-	xor	edx,edi
-	and	ecx,DWORD [40+esp]
-	and	edx,DWORD [44+esp]
-	add	eax,DWORD [192+esp]
-	adc	ebx,DWORD [196+esp]
-	xor	ecx,esi
-	xor	edx,edi
-	mov	esi,DWORD [ebp]
-	mov	edi,DWORD [4+ebp]
-	add	eax,ecx
-	adc	ebx,edx
-	mov	ecx,DWORD [32+esp]
-	mov	edx,DWORD [36+esp]
-	add	eax,esi
-	adc	ebx,edi
-	mov	DWORD [esp],eax
-	mov	DWORD [4+esp],ebx
-	add	eax,ecx
-	adc	ebx,edx
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	mov	DWORD [32+esp],eax
-	mov	DWORD [36+esp],ebx
-	mov	esi,ecx
-	shr	ecx,2
-	mov	edi,edx
-	shr	edx,2
-	mov	ebx,ecx
-	shl	esi,4
-	mov	eax,edx
-	shl	edi,4
-	xor	ebx,esi
-	shr	ecx,5
-	xor	eax,edi
-	shr	edx,5
-	xor	ebx,ecx
-	shl	esi,21
-	xor	eax,edx
-	shl	edi,21
-	xor	eax,esi
-	shr	ecx,21
-	xor	ebx,edi
-	shr	edx,21
-	xor	eax,ecx
-	shl	esi,5
-	xor	ebx,edx
-	shl	edi,5
-	xor	eax,esi
-	xor	ebx,edi
-	mov	ecx,DWORD [8+esp]
-	mov	edx,DWORD [12+esp]
-	mov	esi,DWORD [16+esp]
-	mov	edi,DWORD [20+esp]
-	add	eax,DWORD [esp]
-	adc	ebx,DWORD [4+esp]
-	or	ecx,esi
-	or	edx,edi
-	and	ecx,DWORD [24+esp]
-	and	edx,DWORD [28+esp]
-	and	esi,DWORD [8+esp]
-	and	edi,DWORD [12+esp]
-	or	ecx,esi
-	or	edx,edi
-	add	eax,ecx
-	adc	ebx,edx
-	mov	DWORD [esp],eax
-	mov	DWORD [4+esp],ebx
-	mov	dl,BYTE [ebp]
-	sub	esp,8
-	lea	ebp,[8+ebp]
-	cmp	dl,23
-	jne	NEAR L$01016_79_x86
-	mov	esi,DWORD [840+esp]
-	mov	edi,DWORD [844+esp]
-	mov	eax,DWORD [esi]
-	mov	ebx,DWORD [4+esi]
-	mov	ecx,DWORD [8+esi]
-	mov	edx,DWORD [12+esi]
-	add	eax,DWORD [8+esp]
-	adc	ebx,DWORD [12+esp]
-	mov	DWORD [esi],eax
-	mov	DWORD [4+esi],ebx
-	add	ecx,DWORD [16+esp]
-	adc	edx,DWORD [20+esp]
-	mov	DWORD [8+esi],ecx
-	mov	DWORD [12+esi],edx
-	mov	eax,DWORD [16+esi]
-	mov	ebx,DWORD [20+esi]
-	mov	ecx,DWORD [24+esi]
-	mov	edx,DWORD [28+esi]
-	add	eax,DWORD [24+esp]
-	adc	ebx,DWORD [28+esp]
-	mov	DWORD [16+esi],eax
-	mov	DWORD [20+esi],ebx
-	add	ecx,DWORD [32+esp]
-	adc	edx,DWORD [36+esp]
-	mov	DWORD [24+esi],ecx
-	mov	DWORD [28+esi],edx
-	mov	eax,DWORD [32+esi]
-	mov	ebx,DWORD [36+esi]
-	mov	ecx,DWORD [40+esi]
-	mov	edx,DWORD [44+esi]
-	add	eax,DWORD [40+esp]
-	adc	ebx,DWORD [44+esp]
-	mov	DWORD [32+esi],eax
-	mov	DWORD [36+esi],ebx
-	add	ecx,DWORD [48+esp]
-	adc	edx,DWORD [52+esp]
-	mov	DWORD [40+esi],ecx
-	mov	DWORD [44+esi],edx
-	mov	eax,DWORD [48+esi]
-	mov	ebx,DWORD [52+esi]
-	mov	ecx,DWORD [56+esi]
-	mov	edx,DWORD [60+esi]
-	add	eax,DWORD [56+esp]
-	adc	ebx,DWORD [60+esp]
-	mov	DWORD [48+esi],eax
-	mov	DWORD [52+esi],ebx
-	add	ecx,DWORD [64+esp]
-	adc	edx,DWORD [68+esp]
-	mov	DWORD [56+esi],ecx
-	mov	DWORD [60+esi],edx
-	add	esp,840
-	sub	ebp,640
-	cmp	edi,DWORD [8+esp]
-	jb	NEAR L$002loop_x86
-	mov	esp,DWORD [12+esp]
-	pop	edi
-	pop	esi
-	pop	ebx
-	pop	ebp
-	ret
 align	64
 L$001K512:
 dd	3609767458,1116352408
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
index f991f6c..a8fd1f9 100644
--- a/gen/bcm/x86-mont-apple.S
+++ b/gen/bcm/x86-mont-apple.S
@@ -62,12 +62,6 @@
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
 	movl	%edx,24(%esp)
-	call	L003PIC_me_up
-L003PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L004non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -91,7 +85,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	4,0x90
-L0051st:
+L0031st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -106,7 +100,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	L0051st
+	jl	L0031st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -120,7 +114,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-L006outer:
+L004outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -142,7 +136,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-L007inner:
+L005inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -159,7 +153,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	L007inner
+	jnz	L005inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -177,264 +171,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	L006outer
+	jle	L004outer
 	emms
-	jmp	L008common_tail
+	jmp	L006common_tail
 .align	4,0x90
-L004non_sse2:
-	movl	8(%esp),%esi
-	leal	1(%ebx),%ebp
-	movl	12(%esp),%edi
-	xorl	%ecx,%ecx
-	movl	%esi,%edx
-	andl	$1,%ebp
-	subl	%edi,%edx
-	leal	4(%edi,%ebx,4),%eax
-	orl	%edx,%ebp
-	movl	(%edi),%edi
-	jz	L009bn_sqr_mont
-	movl	%eax,28(%esp)
-	movl	(%esi),%eax
-	xorl	%edx,%edx
-.align	4,0x90
-L010mull:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%eax,%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	movl	(%esi,%ecx,4),%eax
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L010mull
-	movl	%edx,%ebp
-	mull	%edi
-	movl	20(%esp),%edi
-	addl	%ebp,%eax
-	movl	16(%esp),%esi
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	movl	%eax,32(%esp,%ebx,4)
-	xorl	%ecx,%ecx
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	movl	(%esi),%eax
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	incl	%ecx
-	jmp	L0112ndmadd
-.align	4,0x90
-L0121stmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L0121stmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	xorl	%ecx,%ecx
-	addl	36(%esp,%ebx,4),%edx
-	movl	%ebp,32(%esp,%ebx,4)
-	adcl	$0,%ecx
-	movl	(%esi),%eax
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	movl	$1,%ecx
-.align	4,0x90
-L0112ndmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0112ndmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	xorl	%eax,%eax
-	movl	12(%esp),%ecx
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	leal	4(%ecx),%ecx
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	28(%esp),%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	L008common_tail
-	movl	(%ecx),%edi
-	movl	8(%esp),%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%ecx
-	xorl	%edx,%edx
-	movl	(%esi),%eax
-	jmp	L0121stmadd
-.align	4,0x90
-L009bn_sqr_mont:
-	movl	%ebx,(%esp)
-	movl	%ecx,12(%esp)
-	movl	%edi,%eax
-	mull	%edi
-	movl	%eax,32(%esp)
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-	incl	%ecx
-.align	4,0x90
-L013sqr:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	leal	(%ebx,%eax,2),%ebp
-	shrl	$31,%eax
-	cmpl	(%esp),%ecx
-	movl	%eax,%ebx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L013sqr
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	leal	(%ebx,%eax,2),%ebp
-	imull	32(%esp),%edi
-	shrl	$31,%eax
-	movl	%ebp,32(%esp,%ecx,4)
-	leal	(%eax,%edx,2),%ebp
-	movl	(%esi),%eax
-	shrl	$31,%edx
-	movl	%ebp,36(%esp,%ecx,4)
-	movl	%edx,40(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	%ecx,%ebx
-	adcl	$0,%edx
-	movl	4(%esi),%eax
-	movl	$1,%ecx
-.align	4,0x90
-L0143rdmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	4(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%edx,%ebp
-	mull	%edi
-	addl	36(%esp,%ecx,4),%ebp
-	leal	2(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0143rdmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	movl	12(%esp),%ecx
-	xorl	%eax,%eax
-	movl	8(%esp),%esi
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	%ebx,%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	L008common_tail
-	movl	4(%esi,%ecx,4),%edi
-	leal	1(%ecx),%ecx
-	movl	%edi,%eax
-	movl	%ecx,12(%esp)
-	mull	%edi
-	addl	32(%esp,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%eax,32(%esp,%ecx,4)
-	xorl	%ebp,%ebp
-	cmpl	%ebx,%ecx
-	leal	1(%ecx),%ecx
-	je	L015sqrlast
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-.align	4,0x90
-L016sqradd:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	(%eax,%eax,1),%ebp
-	adcl	$0,%edx
-	shrl	$31,%eax
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%eax
-	addl	%ebx,%ebp
-	adcl	$0,%eax
-	cmpl	(%esp),%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%eax,%ebx
-	jle	L016sqradd
-	movl	%edx,%ebp
-	addl	%edx,%edx
-	shrl	$31,%ebp
-	addl	%ebx,%edx
-	adcl	$0,%ebp
-L015sqrlast:
-	movl	20(%esp),%edi
-	movl	16(%esp),%esi
-	imull	32(%esp),%edi
-	addl	32(%esp,%ecx,4),%edx
-	movl	(%esi),%eax
-	adcl	$0,%ebp
-	movl	%edx,32(%esp,%ecx,4)
-	movl	%ebp,36(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	leal	-1(%ecx),%ebx
-	adcl	$0,%edx
-	movl	$1,%ecx
-	movl	4(%esi),%eax
-	jmp	L0143rdmadd
-.align	4,0x90
-L008common_tail:
+L006common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -442,19 +183,19 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	4,0x90
-L017sub:
+L007sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	L017sub
+	jge	L007sub
 	sbbl	$0,%eax
 	movl	$-1,%edx
 	xorl	%eax,%edx
-	jmp	L018copy
+	jmp	L008copy
 .align	4,0x90
-L018copy:
+L008copy:
 	movl	32(%esp,%ebx,4),%esi
 	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
@@ -463,7 +204,7 @@
 	orl	%esi,%ebp
 	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	L018copy
+	jge	L008copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 L000just_leave:
@@ -477,8 +218,4 @@
 .byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
 .byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
 .byte	111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
 #endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
index e6b4ef5..3d3ddb5 100644
--- a/gen/bcm/x86-mont-linux.S
+++ b/gen/bcm/x86-mont-linux.S
@@ -63,12 +63,6 @@
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
 	movl	%edx,24(%esp)
-	call	.L003PIC_me_up
-.L003PIC_me_up:
-	popl	%eax
-	leal	OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	.L004non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -92,7 +86,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	16
-.L0051st:
+.L0031st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -107,7 +101,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	.L0051st
+	jl	.L0031st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -121,7 +115,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-.L006outer:
+.L004outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -143,7 +137,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-.L007inner:
+.L005inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -160,7 +154,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	.L007inner
+	jnz	.L005inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -178,264 +172,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	.L006outer
+	jle	.L004outer
 	emms
-	jmp	.L008common_tail
+	jmp	.L006common_tail
 .align	16
-.L004non_sse2:
-	movl	8(%esp),%esi
-	leal	1(%ebx),%ebp
-	movl	12(%esp),%edi
-	xorl	%ecx,%ecx
-	movl	%esi,%edx
-	andl	$1,%ebp
-	subl	%edi,%edx
-	leal	4(%edi,%ebx,4),%eax
-	orl	%edx,%ebp
-	movl	(%edi),%edi
-	jz	.L009bn_sqr_mont
-	movl	%eax,28(%esp)
-	movl	(%esi),%eax
-	xorl	%edx,%edx
-.align	16
-.L010mull:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%eax,%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	movl	(%esi,%ecx,4),%eax
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L010mull
-	movl	%edx,%ebp
-	mull	%edi
-	movl	20(%esp),%edi
-	addl	%ebp,%eax
-	movl	16(%esp),%esi
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	movl	%eax,32(%esp,%ebx,4)
-	xorl	%ecx,%ecx
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	movl	(%esi),%eax
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	incl	%ecx
-	jmp	.L0112ndmadd
-.align	16
-.L0121stmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L0121stmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	xorl	%ecx,%ecx
-	addl	36(%esp,%ebx,4),%edx
-	movl	%ebp,32(%esp,%ebx,4)
-	adcl	$0,%ecx
-	movl	(%esi),%eax
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	movl	$1,%ecx
-.align	16
-.L0112ndmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0112ndmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	xorl	%eax,%eax
-	movl	12(%esp),%ecx
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	leal	4(%ecx),%ecx
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	28(%esp),%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	.L008common_tail
-	movl	(%ecx),%edi
-	movl	8(%esp),%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%ecx
-	xorl	%edx,%edx
-	movl	(%esi),%eax
-	jmp	.L0121stmadd
-.align	16
-.L009bn_sqr_mont:
-	movl	%ebx,(%esp)
-	movl	%ecx,12(%esp)
-	movl	%edi,%eax
-	mull	%edi
-	movl	%eax,32(%esp)
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-	incl	%ecx
-.align	16
-.L013sqr:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	leal	(%ebx,%eax,2),%ebp
-	shrl	$31,%eax
-	cmpl	(%esp),%ecx
-	movl	%eax,%ebx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L013sqr
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	leal	(%ebx,%eax,2),%ebp
-	imull	32(%esp),%edi
-	shrl	$31,%eax
-	movl	%ebp,32(%esp,%ecx,4)
-	leal	(%eax,%edx,2),%ebp
-	movl	(%esi),%eax
-	shrl	$31,%edx
-	movl	%ebp,36(%esp,%ecx,4)
-	movl	%edx,40(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	%ecx,%ebx
-	adcl	$0,%edx
-	movl	4(%esi),%eax
-	movl	$1,%ecx
-.align	16
-.L0143rdmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	4(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%edx,%ebp
-	mull	%edi
-	addl	36(%esp,%ecx,4),%ebp
-	leal	2(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0143rdmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	movl	12(%esp),%ecx
-	xorl	%eax,%eax
-	movl	8(%esp),%esi
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	%ebx,%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	.L008common_tail
-	movl	4(%esi,%ecx,4),%edi
-	leal	1(%ecx),%ecx
-	movl	%edi,%eax
-	movl	%ecx,12(%esp)
-	mull	%edi
-	addl	32(%esp,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%eax,32(%esp,%ecx,4)
-	xorl	%ebp,%ebp
-	cmpl	%ebx,%ecx
-	leal	1(%ecx),%ecx
-	je	.L015sqrlast
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-.align	16
-.L016sqradd:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	(%eax,%eax,1),%ebp
-	adcl	$0,%edx
-	shrl	$31,%eax
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%eax
-	addl	%ebx,%ebp
-	adcl	$0,%eax
-	cmpl	(%esp),%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%eax,%ebx
-	jle	.L016sqradd
-	movl	%edx,%ebp
-	addl	%edx,%edx
-	shrl	$31,%ebp
-	addl	%ebx,%edx
-	adcl	$0,%ebp
-.L015sqrlast:
-	movl	20(%esp),%edi
-	movl	16(%esp),%esi
-	imull	32(%esp),%edi
-	addl	32(%esp,%ecx,4),%edx
-	movl	(%esi),%eax
-	adcl	$0,%ebp
-	movl	%edx,32(%esp,%ecx,4)
-	movl	%ebp,36(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	leal	-1(%ecx),%ebx
-	adcl	$0,%edx
-	movl	$1,%ecx
-	movl	4(%esi),%eax
-	jmp	.L0143rdmadd
-.align	16
-.L008common_tail:
+.L006common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -443,19 +184,19 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L017sub:
+.L007sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L017sub
+	jge	.L007sub
 	sbbl	$0,%eax
 	movl	$-1,%edx
 	xorl	%eax,%edx
-	jmp	.L018copy
+	jmp	.L008copy
 .align	16
-.L018copy:
+.L008copy:
 	movl	32(%esp,%ebx,4),%esi
 	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
@@ -464,7 +205,7 @@
 	orl	%esi,%ebp
 	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	.L018copy
+	jge	.L008copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 .L000just_leave:
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
index cd77529..931275d 100644
--- a/gen/bcm/x86-mont-win.asm
+++ b/gen/bcm/x86-mont-win.asm
@@ -13,7 +13,6 @@
 %else
 section	.text	code
 %endif
-;extern	_OPENSSL_ia32cap_P
 global	_bn_mul_mont
 align	16
 _bn_mul_mont:
@@ -70,9 +69,6 @@
 	mov	DWORD [20+esp],esi
 	lea	ebx,[edi-3]
 	mov	DWORD [24+esp],edx
-	lea	eax,[_OPENSSL_ia32cap_P]
-	bt	DWORD [eax],26
-	jnc	NEAR L$003non_sse2
 	mov	eax,-1
 	movd	mm7,eax
 	mov	esi,DWORD [8+esp]
@@ -96,7 +92,7 @@
 	psrlq	mm3,32
 	inc	ecx
 align	16
-L$0041st:
+L$0031st:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -111,7 +107,7 @@
 	psrlq	mm3,32
 	lea	ecx,[1+ecx]
 	cmp	ecx,ebx
-	jl	NEAR L$0041st
+	jl	NEAR L$0031st
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -125,7 +121,7 @@
 	paddq	mm3,mm2
 	movq	[32+ebx*4+esp],mm3
 	inc	edx
-L$005outer:
+L$004outer:
 	xor	ecx,ecx
 	movd	mm4,DWORD [edx*4+edi]
 	movd	mm5,DWORD [esi]
@@ -147,7 +143,7 @@
 	paddq	mm2,mm6
 	inc	ecx
 	dec	ebx
-L$006inner:
+L$005inner:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -164,7 +160,7 @@
 	paddq	mm2,mm6
 	dec	ebx
 	lea	ecx,[1+ecx]
-	jnz	NEAR L$006inner
+	jnz	NEAR L$005inner
 	mov	ebx,ecx
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
@@ -182,264 +178,11 @@
 	movq	[32+ebx*4+esp],mm3
 	lea	edx,[1+edx]
 	cmp	edx,ebx
-	jle	NEAR L$005outer
+	jle	NEAR L$004outer
 	emms
-	jmp	NEAR L$007common_tail
+	jmp	NEAR L$006common_tail
 align	16
-L$003non_sse2:
-	mov	esi,DWORD [8+esp]
-	lea	ebp,[1+ebx]
-	mov	edi,DWORD [12+esp]
-	xor	ecx,ecx
-	mov	edx,esi
-	and	ebp,1
-	sub	edx,edi
-	lea	eax,[4+ebx*4+edi]
-	or	ebp,edx
-	mov	edi,DWORD [edi]
-	jz	NEAR L$008bn_sqr_mont
-	mov	DWORD [28+esp],eax
-	mov	eax,DWORD [esi]
-	xor	edx,edx
-align	16
-L$009mull:
-	mov	ebp,edx
-	mul	edi
-	add	ebp,eax
-	lea	ecx,[1+ecx]
-	adc	edx,0
-	mov	eax,DWORD [ecx*4+esi]
-	cmp	ecx,ebx
-	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$009mull
-	mov	ebp,edx
-	mul	edi
-	mov	edi,DWORD [20+esp]
-	add	eax,ebp
-	mov	esi,DWORD [16+esp]
-	adc	edx,0
-	imul	edi,DWORD [32+esp]
-	mov	DWORD [32+ebx*4+esp],eax
-	xor	ecx,ecx
-	mov	DWORD [36+ebx*4+esp],edx
-	mov	DWORD [40+ebx*4+esp],ecx
-	mov	eax,DWORD [esi]
-	mul	edi
-	add	eax,DWORD [32+esp]
-	mov	eax,DWORD [4+esi]
-	adc	edx,0
-	inc	ecx
-	jmp	NEAR L$0102ndmadd
-align	16
-L$0111stmadd:
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [32+ecx*4+esp]
-	lea	ecx,[1+ecx]
-	adc	edx,0
-	add	ebp,eax
-	mov	eax,DWORD [ecx*4+esi]
-	adc	edx,0
-	cmp	ecx,ebx
-	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$0111stmadd
-	mov	ebp,edx
-	mul	edi
-	add	eax,DWORD [32+ebx*4+esp]
-	mov	edi,DWORD [20+esp]
-	adc	edx,0
-	mov	esi,DWORD [16+esp]
-	add	ebp,eax
-	adc	edx,0
-	imul	edi,DWORD [32+esp]
-	xor	ecx,ecx
-	add	edx,DWORD [36+ebx*4+esp]
-	mov	DWORD [32+ebx*4+esp],ebp
-	adc	ecx,0
-	mov	eax,DWORD [esi]
-	mov	DWORD [36+ebx*4+esp],edx
-	mov	DWORD [40+ebx*4+esp],ecx
-	mul	edi
-	add	eax,DWORD [32+esp]
-	mov	eax,DWORD [4+esi]
-	adc	edx,0
-	mov	ecx,1
-align	16
-L$0102ndmadd:
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [32+ecx*4+esp]
-	lea	ecx,[1+ecx]
-	adc	edx,0
-	add	ebp,eax
-	mov	eax,DWORD [ecx*4+esi]
-	adc	edx,0
-	cmp	ecx,ebx
-	mov	DWORD [24+ecx*4+esp],ebp
-	jl	NEAR L$0102ndmadd
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [32+ebx*4+esp]
-	adc	edx,0
-	add	ebp,eax
-	adc	edx,0
-	mov	DWORD [28+ebx*4+esp],ebp
-	xor	eax,eax
-	mov	ecx,DWORD [12+esp]
-	add	edx,DWORD [36+ebx*4+esp]
-	adc	eax,DWORD [40+ebx*4+esp]
-	lea	ecx,[4+ecx]
-	mov	DWORD [32+ebx*4+esp],edx
-	cmp	ecx,DWORD [28+esp]
-	mov	DWORD [36+ebx*4+esp],eax
-	je	NEAR L$007common_tail
-	mov	edi,DWORD [ecx]
-	mov	esi,DWORD [8+esp]
-	mov	DWORD [12+esp],ecx
-	xor	ecx,ecx
-	xor	edx,edx
-	mov	eax,DWORD [esi]
-	jmp	NEAR L$0111stmadd
-align	16
-L$008bn_sqr_mont:
-	mov	DWORD [esp],ebx
-	mov	DWORD [12+esp],ecx
-	mov	eax,edi
-	mul	edi
-	mov	DWORD [32+esp],eax
-	mov	ebx,edx
-	shr	edx,1
-	and	ebx,1
-	inc	ecx
-align	16
-L$012sqr:
-	mov	eax,DWORD [ecx*4+esi]
-	mov	ebp,edx
-	mul	edi
-	add	eax,ebp
-	lea	ecx,[1+ecx]
-	adc	edx,0
-	lea	ebp,[eax*2+ebx]
-	shr	eax,31
-	cmp	ecx,DWORD [esp]
-	mov	ebx,eax
-	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$012sqr
-	mov	eax,DWORD [ecx*4+esi]
-	mov	ebp,edx
-	mul	edi
-	add	eax,ebp
-	mov	edi,DWORD [20+esp]
-	adc	edx,0
-	mov	esi,DWORD [16+esp]
-	lea	ebp,[eax*2+ebx]
-	imul	edi,DWORD [32+esp]
-	shr	eax,31
-	mov	DWORD [32+ecx*4+esp],ebp
-	lea	ebp,[edx*2+eax]
-	mov	eax,DWORD [esi]
-	shr	edx,31
-	mov	DWORD [36+ecx*4+esp],ebp
-	mov	DWORD [40+ecx*4+esp],edx
-	mul	edi
-	add	eax,DWORD [32+esp]
-	mov	ebx,ecx
-	adc	edx,0
-	mov	eax,DWORD [4+esi]
-	mov	ecx,1
-align	16
-L$0133rdmadd:
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [32+ecx*4+esp]
-	adc	edx,0
-	add	ebp,eax
-	mov	eax,DWORD [4+ecx*4+esi]
-	adc	edx,0
-	mov	DWORD [28+ecx*4+esp],ebp
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [36+ecx*4+esp]
-	lea	ecx,[2+ecx]
-	adc	edx,0
-	add	ebp,eax
-	mov	eax,DWORD [ecx*4+esi]
-	adc	edx,0
-	cmp	ecx,ebx
-	mov	DWORD [24+ecx*4+esp],ebp
-	jl	NEAR L$0133rdmadd
-	mov	ebp,edx
-	mul	edi
-	add	ebp,DWORD [32+ebx*4+esp]
-	adc	edx,0
-	add	ebp,eax
-	adc	edx,0
-	mov	DWORD [28+ebx*4+esp],ebp
-	mov	ecx,DWORD [12+esp]
-	xor	eax,eax
-	mov	esi,DWORD [8+esp]
-	add	edx,DWORD [36+ebx*4+esp]
-	adc	eax,DWORD [40+ebx*4+esp]
-	mov	DWORD [32+ebx*4+esp],edx
-	cmp	ecx,ebx
-	mov	DWORD [36+ebx*4+esp],eax
-	je	NEAR L$007common_tail
-	mov	edi,DWORD [4+ecx*4+esi]
-	lea	ecx,[1+ecx]
-	mov	eax,edi
-	mov	DWORD [12+esp],ecx
-	mul	edi
-	add	eax,DWORD [32+ecx*4+esp]
-	adc	edx,0
-	mov	DWORD [32+ecx*4+esp],eax
-	xor	ebp,ebp
-	cmp	ecx,ebx
-	lea	ecx,[1+ecx]
-	je	NEAR L$014sqrlast
-	mov	ebx,edx
-	shr	edx,1
-	and	ebx,1
-align	16
-L$015sqradd:
-	mov	eax,DWORD [ecx*4+esi]
-	mov	ebp,edx
-	mul	edi
-	add	eax,ebp
-	lea	ebp,[eax*1+eax]
-	adc	edx,0
-	shr	eax,31
-	add	ebp,DWORD [32+ecx*4+esp]
-	lea	ecx,[1+ecx]
-	adc	eax,0
-	add	ebp,ebx
-	adc	eax,0
-	cmp	ecx,DWORD [esp]
-	mov	DWORD [28+ecx*4+esp],ebp
-	mov	ebx,eax
-	jle	NEAR L$015sqradd
-	mov	ebp,edx
-	add	edx,edx
-	shr	ebp,31
-	add	edx,ebx
-	adc	ebp,0
-L$014sqrlast:
-	mov	edi,DWORD [20+esp]
-	mov	esi,DWORD [16+esp]
-	imul	edi,DWORD [32+esp]
-	add	edx,DWORD [32+ecx*4+esp]
-	mov	eax,DWORD [esi]
-	adc	ebp,0
-	mov	DWORD [32+ecx*4+esp],edx
-	mov	DWORD [36+ecx*4+esp],ebp
-	mul	edi
-	add	eax,DWORD [32+esp]
-	lea	ebx,[ecx-1]
-	adc	edx,0
-	mov	ecx,1
-	mov	eax,DWORD [4+esi]
-	jmp	NEAR L$0133rdmadd
-align	16
-L$007common_tail:
+L$006common_tail:
 	mov	ebp,DWORD [16+esp]
 	mov	edi,DWORD [4+esp]
 	lea	esi,[32+esp]
@@ -447,19 +190,19 @@
 	mov	ecx,ebx
 	xor	edx,edx
 align	16
-L$016sub:
+L$007sub:
 	sbb	eax,DWORD [edx*4+ebp]
 	mov	DWORD [edx*4+edi],eax
 	dec	ecx
 	mov	eax,DWORD [4+edx*4+esi]
 	lea	edx,[1+edx]
-	jge	NEAR L$016sub
+	jge	NEAR L$007sub
 	sbb	eax,0
 	mov	edx,-1
 	xor	edx,eax
-	jmp	NEAR L$017copy
+	jmp	NEAR L$008copy
 align	16
-L$017copy:
+L$008copy:
 	mov	esi,DWORD [32+ebx*4+esp]
 	mov	ebp,DWORD [ebx*4+edi]
 	mov	DWORD [32+ebx*4+esp],ecx
@@ -468,7 +211,7 @@
 	or	ebp,esi
 	mov	DWORD [ebx*4+edi],ebp
 	dec	ebx
-	jge	NEAR L$017copy
+	jge	NEAR L$008copy
 	mov	esp,DWORD [24+esp]
 	mov	eax,1
 L$000just_leave:
@@ -482,8 +225,6 @@
 db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
 db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
 db	111,114,103,62,0
-segment	.bss
-common	_OPENSSL_ia32cap_P 16
 %else
 ; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
 ret