Remove SSE2 checks in 32-bit x86 assembly
crypto/internal.h has required SSE2 support for a few months now
without much fuss. Finish the job and remove the fallback paths. We've
never tested any of these paths, and this removes a slew of
OPENSSL_ia32cap_P references from the assembly.
Bug: 673
Change-Id: I446a033d132af5038ab427b8560cbf20c1d97335
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68207
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/asm/bn-586.pl b/crypto/fipsmodule/bn/asm/bn-586.pl
index eae6da9..e4fc0f9 100644
--- a/crypto/fipsmodule/bn/asm/bn-586.pl
+++ b/crypto/fipsmodule/bn/asm/bn-586.pl
@@ -18,8 +18,6 @@
$sse2=1;
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
@@ -35,17 +33,13 @@
{
local($name)=@_;
- &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+ &function_begin_B($name);
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt(&DWP(0,"eax"),26);
- &jnc(&label("maw_non_sse2"));
-
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
@@ -135,85 +129,7 @@
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
-
- &set_label("maw_non_sse2",16);
}
-
- # function_begin prologue
- &push("ebp");
- &push("ebx");
- &push("esi");
- &push("edi");
-
- &comment("");
- $Low="eax";
- $High="edx";
- $a="ebx";
- $w="ebp";
- $r="edi";
- $c="esi";
-
- &xor($c,$c); # clear carry
- &mov($r,&wparam(0)); #
-
- &mov("ecx",&wparam(2)); #
- &mov($a,&wparam(1)); #
-
- &and("ecx",0xfffffff8); # num / 8
- &mov($w,&wparam(3)); #
-
- &push("ecx"); # Up the stack for a tmp variable
-
- &jz(&label("maw_finish"));
-
- &set_label("maw_loop",16);
-
- for ($i=0; $i<32; $i+=4)
- {
- &comment("Round $i");
-
- &mov("eax",&DWP($i,$a)); # *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+= c
- &adc("edx",0); # H(t)+=carry
- &add("eax",&DWP($i,$r)); # L(t)+= *r
- &adc("edx",0); # H(t)+=carry
- &mov(&DWP($i,$r),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
- }
-
- &comment("");
- &sub("ecx",8);
- &lea($a,&DWP(32,$a));
- &lea($r,&DWP(32,$r));
- &jnz(&label("maw_loop"));
-
- &set_label("maw_finish",0);
- &mov("ecx",&wparam(2)); # get num
- &and("ecx",7);
- &jnz(&label("maw_finish2")); # helps branch prediction
- &jmp(&label("maw_end"));
-
- &set_label("maw_finish2",1);
- for ($i=0; $i<7; $i++)
- {
- &comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a)); # *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+=c
- &adc("edx",0); # H(t)+=carry
- &add("eax",&DWP($i*4,$r)); # L(t)+= *r
- &adc("edx",0); # H(t)+=carry
- &dec("ecx") if ($i != 7-1);
- &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
- &jz(&label("maw_end")) if ($i != 7-1);
- }
- &set_label("maw_end",0);
- &mov("eax",$c);
-
- &pop("ecx"); # clear variable from
-
&function_end($name);
}
@@ -221,17 +137,13 @@
{
local($name)=@_;
- &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+ &function_begin_B($name);
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt(&DWP(0,"eax"),26);
- &jnc(&label("mw_non_sse2"));
-
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
@@ -252,79 +164,7 @@
&movd("eax","mm1"); # return carry
&emms();
&ret();
- &set_label("mw_non_sse2",16);
}
-
- # function_begin prologue
- &push("ebp");
- &push("ebx");
- &push("esi");
- &push("edi");
-
- &comment("");
- $Low="eax";
- $High="edx";
- $a="ebx";
- $w="ecx";
- $r="edi";
- $c="esi";
- $num="ebp";
-
- &xor($c,$c); # clear carry
- &mov($r,&wparam(0)); #
- &mov($a,&wparam(1)); #
- &mov($num,&wparam(2)); #
- &mov($w,&wparam(3)); #
-
- &and($num,0xfffffff8); # num / 8
- &jz(&label("mw_finish"));
-
- &set_label("mw_loop",0);
- for ($i=0; $i<32; $i+=4)
- {
- &comment("Round $i");
-
- &mov("eax",&DWP($i,$a,"",0)); # *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+=c
- # XXX
-
- &adc("edx",0); # H(t)+=carry
- &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
-
- &mov($c,"edx"); # c= H(t);
- }
-
- &comment("");
- &add($a,32);
- &add($r,32);
- &sub($num,8);
- &jz(&label("mw_finish"));
- &jmp(&label("mw_loop"));
-
- &set_label("mw_finish",0);
- &mov($num,&wparam(2)); # get num
- &and($num,7);
- &jnz(&label("mw_finish2"));
- &jmp(&label("mw_end"));
-
- &set_label("mw_finish2",1);
- for ($i=0; $i<7; $i++)
- {
- &comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a,"",0));# *a
- &mul($w); # *a * w
- &add("eax",$c); # L(t)+=c
- # XXX
- &adc("edx",0); # H(t)+=carry
- &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
- &mov($c,"edx"); # c= H(t);
- &dec($num) if ($i != 7-1);
- &jz(&label("mw_end")) if ($i != 7-1);
- }
- &set_label("mw_end",0);
- &mov("eax",$c);
-
&function_end($name);
}
@@ -332,17 +172,13 @@
{
local($name)=@_;
- &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+ &function_begin_B($name);
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt(&DWP(0,"eax"),26);
- &jnc(&label("sqr_non_sse2"));
-
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
@@ -358,62 +194,7 @@
&emms();
&ret();
- &set_label("sqr_non_sse2",16);
}
-
- # function_begin prologue
- &push("ebp");
- &push("ebx");
- &push("esi");
- &push("edi");
-
- &comment("");
- $r="esi";
- $a="edi";
- $num="ebx";
-
- &mov($r,&wparam(0)); #
- &mov($a,&wparam(1)); #
- &mov($num,&wparam(2)); #
-
- &and($num,0xfffffff8); # num / 8
- &jz(&label("sw_finish"));
-
- &set_label("sw_loop",0);
- for ($i=0; $i<32; $i+=4)
- {
- &comment("Round $i");
- &mov("eax",&DWP($i,$a,"",0)); # *a
- # XXX
- &mul("eax"); # *a * *a
- &mov(&DWP($i*2,$r,"",0),"eax"); #
- &mov(&DWP($i*2+4,$r,"",0),"edx");#
- }
-
- &comment("");
- &add($a,32);
- &add($r,64);
- &sub($num,8);
- &jnz(&label("sw_loop"));
-
- &set_label("sw_finish",0);
- &mov($num,&wparam(2)); # get num
- &and($num,7);
- &jz(&label("sw_end"));
-
- for ($i=0; $i<7; $i++)
- {
- &comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a,"",0)); # *a
- # XXX
- &mul("eax"); # *a * *a
- &mov(&DWP($i*8,$r,"",0),"eax"); #
- &dec($num) if ($i != 7-1);
- &mov(&DWP($i*8+4,$r,"",0),"edx");
- &jz(&label("sw_end")) if ($i != 7-1);
- }
- &set_label("sw_end",0);
-
&function_end($name);
}
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index c097574..7b7cca7 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -44,8 +44,6 @@
$sse2=1;
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
&function_begin("bn_mul_mont");
$i="edx";
@@ -146,10 +144,6 @@
$temp="mm6";
$mask="mm7";
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"eax"),26);
- &jnc (&label("non_sse2"));
-
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
@@ -291,298 +285,6 @@
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
-
-&set_label("non_sse2",16);
-}
-
-if (0) {
- &mov ("esp",$_sp);
- &xor ("eax","eax"); # signal "not fast enough [yet]"
- &jmp (&label("just_leave"));
- # While the below code provides competitive performance for
- # all key lengths on modern Intel cores, it's still more
- # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
- # means compared to the original integer-only assembler.
- # 512-bit RSA sign is better by ~40%, but that's about all
- # one can say about all CPUs...
-} else {
-$inp="esi"; # integer path uses these registers differently
-$word="edi";
-$carry="ebp";
-
- &mov ($inp,$_ap);
- &lea ($carry,&DWP(1,$num));
- &mov ($word,$_bp);
- &xor ($j,$j); # j=0
- &mov ("edx",$inp);
- &and ($carry,1); # see if num is even
- &sub ("edx",$word); # see if ap==bp
- &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
- &or ($carry,"edx");
- &mov ($word,&DWP(0,$word)); # bp[0]
- &jz (&label("bn_sqr_mont"));
- &mov ($_bpend,"eax");
- &mov ("eax",&DWP(0,$inp));
- &xor ("edx","edx");
-
-&set_label("mull",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[0]
- &add ($carry,"eax");
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("mull"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[0]
- &mov ($word,$_n0);
- &add ("eax",$carry);
- &mov ($inp,$_np);
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
- &xor ($j,$j);
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &inc ($j);
-
- &jmp (&label("2ndmadd"));
-
-&set_label("1stmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[i]
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("1stmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[i]
- &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &add ($carry,"eax");
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &xor ($j,$j);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
- &adc ($j,0);
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &mov ($j,1);
-
-&set_label("2ndmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
- &jl (&label("2ndmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &xor ("eax","eax");
- &mov ($j,$_bp); # &bp[i]
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &lea ($j,&DWP(4,$j));
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$_bpend);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(0,$j)); # bp[i+1]
- &mov ($inp,$_ap);
- &mov ($_bp,$j); # &bp[++i]
- &xor ($j,$j);
- &xor ("edx","edx");
- &mov ("eax",&DWP(0,$inp));
- &jmp (&label("1stmadd"));
-
-&set_label("bn_sqr_mont",16);
-$sbit=$num;
- &mov ($_num,$num);
- &mov ($_bp,$j); # i=0
-
- &mov ("eax",$word); # ap[0]
- &mul ($word); # ap[0]*ap[0]
- &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
- &mov ($sbit,"edx");
- &shr ("edx",1);
- &and ($sbit,1);
- &inc ($j);
-&set_label("sqr",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[0]
- &add ("eax",$carry);
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &shr ("eax",31);
- &cmp ($j,$_num);
- &mov ($sbit,"eax");
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("sqr"));
-
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*ap[0]
- &add ("eax",$carry);
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
- &shr ("eax",31);
- &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
-
- &lea ($carry,&DWP(0,"eax","edx",2));
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &shr ("edx",31);
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
- &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ($num,$j);
- &adc ("edx",0);
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &mov ($j,1);
-
-&set_label("3rdmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
-
- &mov ($carry,"edx");
- &mul ($word); # np[j+1]*m
- &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
- &lea ($j,&DWP(2,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("3rdmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &mov ($j,$_bp); # i
- &xor ("eax","eax");
- &mov ($inp,$_ap);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$num);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
- &lea ($j,&DWP(1,$j));
- &mov ("eax",$word);
- &mov ($_bp,$j); # ++i
- &mul ($word); # ap[i]*ap[i]
- &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
- &adc ("edx",0);
- &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
- &xor ($carry,$carry);
- &cmp ($j,$num);
- &lea ($j,&DWP(1,$j));
- &je (&label("sqrlast"));
-
- &mov ($sbit,"edx"); # zaps $num
- &shr ("edx",1);
- &and ($sbit,1);
-&set_label("sqradd",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[i]
- &add ("eax",$carry);
- &lea ($carry,&DWP(0,"eax","eax"));
- &adc ("edx",0);
- &shr ("eax",31);
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("eax",0);
- &add ($carry,$sbit);
- &adc ("eax",0);
- &cmp ($j,$_num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &mov ($sbit,"eax");
- &jle (&label("sqradd"));
-
- &mov ($carry,"edx");
- &add ("edx","edx");
- &shr ($carry,31);
- &add ("edx",$sbit);
- &adc ($carry,0);
-&set_label("sqrlast");
- &mov ($word,$_n0);
- &mov ($inp,$_np);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &adc ($carry,0);
- &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &lea ($num,&DWP(-1,$j));
- &adc ("edx",0);
- &mov ($j,1);
- &mov ("eax",&DWP(4,$inp)); # np[1]
-
- &jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
diff --git a/crypto/fipsmodule/sha/asm/sha512-586.pl b/crypto/fipsmodule/sha/asm/sha512-586.pl
index b288776..67ad8a3 100644
--- a/crypto/fipsmodule/sha/asm/sha512-586.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-586.pl
@@ -315,9 +315,6 @@
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&mov ("ecx",&DWP(0,"edx"));
- &test ("ecx",1<<26);
- &jz (&label("loop_x86"));
-
&mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
@@ -688,149 +685,6 @@
}
&function_end_A();
}
-&set_label("loop_x86",16);
- # copy input block to stack reversing byte and qword order
- for ($i=0;$i<8;$i++) {
- &mov ("eax",&DWP($i*16+0,"edi"));
- &mov ("ebx",&DWP($i*16+4,"edi"));
- &mov ("ecx",&DWP($i*16+8,"edi"));
- &mov ("edx",&DWP($i*16+12,"edi"));
- &bswap ("eax");
- &bswap ("ebx");
- &bswap ("ecx");
- &bswap ("edx");
- &push ("eax");
- &push ("ebx");
- &push ("ecx");
- &push ("edx");
- }
- &add ("edi",128);
- &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
- &mov (&DWP(8*(9+16)+4,"esp"),"edi");
-
- # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
- &lea ("edi",&DWP(8,"esp"));
- &mov ("ecx",16);
- &data_word(0xA5F3F689); # rep movsd
-
-&set_label("00_15_x86",16);
- &BODY_00_15_x86();
-
- &cmp (&LB("edx"),0x94);
- &jne (&label("00_15_x86"));
-
-&set_label("16_79_x86",16);
- #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
- &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
- &mov ("esi","ecx");
-
- &shr ("ecx",1); # lo>>1
- &mov ("edi","edx");
- &shr ("edx",1); # hi>>1
- &mov ("eax","ecx");
- &shl ("esi",24); # lo<<24
- &mov ("ebx","edx");
- &shl ("edi",24); # hi<<24
- &xor ("ebx","esi");
-
- &shr ("ecx",7-1); # lo>>7
- &xor ("eax","edi");
- &shr ("edx",7-1); # hi>>7
- &xor ("eax","ecx");
- &shl ("esi",31-24); # lo<<31
- &xor ("ebx","edx");
- &shl ("edi",25-24); # hi<<25
- &xor ("ebx","esi");
-
- &shr ("ecx",8-7); # lo>>8
- &xor ("eax","edi");
- &shr ("edx",8-7); # hi>>8
- &xor ("eax","ecx");
- &shl ("edi",31-25); # hi<<31
- &xor ("ebx","edx");
- &xor ("eax","edi"); # T1 = sigma0(X[-15])
-
- &mov (&DWP(0,"esp"),"eax");
- &mov (&DWP(4,"esp"),"ebx"); # put T1 away
-
- #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
- &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
- &mov ("esi","ecx");
-
- &shr ("ecx",6); # lo>>6
- &mov ("edi","edx");
- &shr ("edx",6); # hi>>6
- &mov ("eax","ecx");
- &shl ("esi",3); # lo<<3
- &mov ("ebx","edx");
- &shl ("edi",3); # hi<<3
- &xor ("eax","esi");
-
- &shr ("ecx",19-6); # lo>>19
- &xor ("ebx","edi");
- &shr ("edx",19-6); # hi>>19
- &xor ("eax","ecx");
- &shl ("esi",13-3); # lo<<13
- &xor ("ebx","edx");
- &shl ("edi",13-3); # hi<<13
- &xor ("ebx","esi");
-
- &shr ("ecx",29-19); # lo>>29
- &xor ("eax","edi");
- &shr ("edx",29-19); # hi>>29
- &xor ("ebx","ecx");
- &shl ("edi",26-13); # hi<<26
- &xor ("eax","edx");
- &xor ("eax","edi"); # sigma1(X[-2])
-
- &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
- &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
- &add ("eax",&DWP(0,"esp"));
- &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
- &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
- &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
- &add ("eax","ecx");
- &adc ("ebx","edx"); # T1 += X[-16]
- &add ("eax","esi");
- &adc ("ebx","edi"); # T1 += X[-7]
- &mov (&DWP(8*(9+15)+0,"esp"),"eax");
- &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
-
- &BODY_00_15_x86();
-
- &cmp (&LB("edx"),0x17);
- &jne (&label("16_79_x86"));
-
- &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
- &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
- for($i=0;$i<4;$i++) {
- &mov ("eax",&DWP($i*16+0,"esi"));
- &mov ("ebx",&DWP($i*16+4,"esi"));
- &mov ("ecx",&DWP($i*16+8,"esi"));
- &mov ("edx",&DWP($i*16+12,"esi"));
- &add ("eax",&DWP(8+($i*16)+0,"esp"));
- &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
- &mov (&DWP($i*16+0,"esi"),"eax");
- &mov (&DWP($i*16+4,"esi"),"ebx");
- &add ("ecx",&DWP(8+($i*16)+8,"esp"));
- &adc ("edx",&DWP(8+($i*16)+12,"esp"));
- &mov (&DWP($i*16+8,"esi"),"ecx");
- &mov (&DWP($i*16+12,"esi"),"edx");
- }
- &add ("esp",8*(9+16+80)); # destroy frame
- &sub ($K512,8*80); # rewind K
-
- &cmp ("edi",&DWP(8,"esp")); # are we done yet?
- &jb (&label("loop_x86"));
-
- &mov ("esp",&DWP(12,"esp")); # restore sp
-&function_end_A();
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
diff --git a/gen/bcm/bn-586-apple.S b/gen/bcm/bn-586-apple.S
index 93513d0..f483ef1 100644
--- a/gen/bcm/bn-586-apple.S
+++ b/gen/bcm/bn-586-apple.S
@@ -10,20 +10,14 @@
.align 4
_bn_mul_add_words:
L_bn_mul_add_words_begin:
- call L000PIC_me_up
-L000PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L001maw_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
- jmp L002maw_sse2_entry
+ jmp L000maw_sse2_entry
.align 4,0x90
-L003maw_sse2_unrolled:
+L001maw_sse2_unrolled:
movd (%eax),%mm3
paddq %mm3,%mm1
movd (%edx),%mm2
@@ -83,12 +77,12 @@
leal 32(%eax),%eax
psrlq $32,%mm1
subl $8,%ecx
- jz L004maw_sse2_exit
-L002maw_sse2_entry:
+ jz L002maw_sse2_exit
+L000maw_sse2_entry:
testl $4294967288,%ecx
- jnz L003maw_sse2_unrolled
+ jnz L001maw_sse2_unrolled
.align 2,0x90
-L005maw_sse2_loop:
+L003maw_sse2_loop:
movd (%edx),%mm2
movd (%eax),%mm3
pmuludq %mm0,%mm2
@@ -99,189 +93,11 @@
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
- jnz L005maw_sse2_loop
-L004maw_sse2_exit:
+ jnz L003maw_sse2_loop
+L002maw_sse2_exit:
movd %mm1,%eax
emms
ret
-.align 4,0x90
-L001maw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 28(%esp),%ecx
- movl 24(%esp),%ebx
- andl $4294967288,%ecx
- movl 32(%esp),%ebp
- pushl %ecx
- jz L006maw_finish
-.align 4,0x90
-L007maw_loop:
- # Round 0
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- # Round 4
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- # Round 8
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- # Round 12
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- # Round 16
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- # Round 20
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- # Round 24
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
- # Round 28
- movl 28(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 28(%edi),%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- subl $8,%ecx
- leal 32(%ebx),%ebx
- leal 32(%edi),%edi
- jnz L007maw_loop
-L006maw_finish:
- movl 32(%esp),%ecx
- andl $7,%ecx
- jnz L008maw_finish2
- jmp L009maw_end
-L008maw_finish2:
- # Tail Round 0
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 1
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,4(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 2
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,8(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 3
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,12(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 4
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,16(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 5
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,20(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 6
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-L009maw_end:
- movl %esi,%eax
- popl %ecx
popl %edi
popl %esi
popl %ebx
@@ -292,19 +108,13 @@
.align 4
_bn_mul_words:
L_bn_mul_words_begin:
- call L010PIC_me_up
-L010PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L011mw_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
.align 4,0x90
-L012mw_sse2_loop:
+L004mw_sse2_loop:
movd (%edx),%mm2
pmuludq %mm0,%mm2
leal 4(%edx),%edx
@@ -313,156 +123,10 @@
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
- jnz L012mw_sse2_loop
+ jnz L004mw_sse2_loop
movd %mm1,%eax
emms
ret
-.align 4,0x90
-L011mw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 24(%esp),%ebx
- movl 28(%esp),%ebp
- movl 32(%esp),%ecx
- andl $4294967288,%ebp
- jz L013mw_finish
-L014mw_loop:
- # Round 0
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- # Round 4
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- # Round 8
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- # Round 12
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- # Round 16
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- # Round 20
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- # Round 24
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
- # Round 28
- movl 28(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- addl $32,%ebx
- addl $32,%edi
- subl $8,%ebp
- jz L013mw_finish
- jmp L014mw_loop
-L013mw_finish:
- movl 28(%esp),%ebp
- andl $7,%ebp
- jnz L015mw_finish2
- jmp L016mw_end
-L015mw_finish2:
- # Tail Round 0
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 1
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 2
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 3
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 4
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 5
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 6
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-L016mw_end:
- movl %esi,%eax
popl %edi
popl %esi
popl %ebx
@@ -473,136 +137,20 @@
.align 4
_bn_sqr_words:
L_bn_sqr_words_begin:
- call L017PIC_me_up
-L017PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L018sqr_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
.align 4,0x90
-L019sqr_sse2_loop:
+L005sqr_sse2_loop:
movd (%edx),%mm0
pmuludq %mm0,%mm0
leal 4(%edx),%edx
movq %mm0,(%eax)
subl $1,%ecx
leal 8(%eax),%eax
- jnz L019sqr_sse2_loop
+ jnz L005sqr_sse2_loop
emms
ret
-.align 4,0x90
-L018sqr_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%ebx
- andl $4294967288,%ebx
- jz L020sw_finish
-L021sw_loop:
- # Round 0
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- movl %edx,4(%esi)
- # Round 4
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- movl %edx,12(%esi)
- # Round 8
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- movl %edx,20(%esi)
- # Round 12
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- movl %edx,28(%esi)
- # Round 16
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- movl %edx,36(%esi)
- # Round 20
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- movl %edx,44(%esi)
- # Round 24
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
- # Round 28
- movl 28(%edi),%eax
- mull %eax
- movl %eax,56(%esi)
- movl %edx,60(%esi)
-
- addl $32,%edi
- addl $64,%esi
- subl $8,%ebx
- jnz L021sw_loop
-L020sw_finish:
- movl 28(%esp),%ebx
- andl $7,%ebx
- jz L022sw_end
- # Tail Round 0
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- decl %ebx
- movl %edx,4(%esi)
- jz L022sw_end
- # Tail Round 1
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- decl %ebx
- movl %edx,12(%esi)
- jz L022sw_end
- # Tail Round 2
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- decl %ebx
- movl %edx,20(%esi)
- jz L022sw_end
- # Tail Round 3
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- decl %ebx
- movl %edx,28(%esi)
- jz L022sw_end
- # Tail Round 4
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- decl %ebx
- movl %edx,36(%esi)
- jz L022sw_end
- # Tail Round 5
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- decl %ebx
- movl %edx,44(%esi)
- jz L022sw_end
- # Tail Round 6
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
-L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -634,8 +182,8 @@
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L023aw_finish
-L024aw_loop:
+ jz L006aw_finish
+L007aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -713,11 +261,11 @@
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L024aw_loop
-L023aw_finish:
+ jnz L007aw_loop
+L006aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L025aw_end
+ jz L008aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -728,7 +276,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -739,7 +287,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -750,7 +298,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -761,7 +309,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -772,7 +320,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -783,7 +331,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L025aw_end
+ jz L008aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -793,7 +341,7 @@
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L025aw_end:
+L008aw_end:
popl %edi
popl %esi
popl %ebx
@@ -815,8 +363,8 @@
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L026aw_finish
-L027aw_loop:
+ jz L009aw_finish
+L010aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -894,11 +442,11 @@
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L027aw_loop
-L026aw_finish:
+ jnz L010aw_loop
+L009aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L028aw_end
+ jz L011aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -909,7 +457,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -920,7 +468,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -931,7 +479,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -942,7 +490,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -953,7 +501,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -964,7 +512,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L028aw_end
+ jz L011aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -974,14 +522,10 @@
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L028aw_end:
+L011aw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/bn-586-linux.S b/gen/bcm/bn-586-linux.S
index 311f22c..fb83b22 100644
--- a/gen/bcm/bn-586-linux.S
+++ b/gen/bcm/bn-586-linux.S
@@ -11,20 +11,14 @@
.align 16
bn_mul_add_words:
.L_bn_mul_add_words_begin:
- call .L000PIC_me_up
-.L000PIC_me_up:
- popl %eax
- leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc .L001maw_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
- jmp .L002maw_sse2_entry
+ jmp .L000maw_sse2_entry
.align 16
-.L003maw_sse2_unrolled:
+.L001maw_sse2_unrolled:
movd (%eax),%mm3
paddq %mm3,%mm1
movd (%edx),%mm2
@@ -84,12 +78,12 @@
leal 32(%eax),%eax
psrlq $32,%mm1
subl $8,%ecx
- jz .L004maw_sse2_exit
-.L002maw_sse2_entry:
+ jz .L002maw_sse2_exit
+.L000maw_sse2_entry:
testl $4294967288,%ecx
- jnz .L003maw_sse2_unrolled
+ jnz .L001maw_sse2_unrolled
.align 4
-.L005maw_sse2_loop:
+.L003maw_sse2_loop:
movd (%edx),%mm2
movd (%eax),%mm3
pmuludq %mm0,%mm2
@@ -100,189 +94,11 @@
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
- jnz .L005maw_sse2_loop
-.L004maw_sse2_exit:
+ jnz .L003maw_sse2_loop
+.L002maw_sse2_exit:
movd %mm1,%eax
emms
ret
-.align 16
-.L001maw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 28(%esp),%ecx
- movl 24(%esp),%ebx
- andl $4294967288,%ecx
- movl 32(%esp),%ebp
- pushl %ecx
- jz .L006maw_finish
-.align 16
-.L007maw_loop:
-
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
-
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
-
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
-
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
-
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
-
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
-
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-
- movl 28(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 28(%edi),%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- subl $8,%ecx
- leal 32(%ebx),%ebx
- leal 32(%edi),%edi
- jnz .L007maw_loop
-.L006maw_finish:
- movl 32(%esp),%ecx
- andl $7,%ecx
- jnz .L008maw_finish2
- jmp .L009maw_end
-.L008maw_finish2:
-
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,4(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,8(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,12(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,16(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,20(%edi)
- movl %edx,%esi
- jz .L009maw_end
-
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-.L009maw_end:
- movl %esi,%eax
- popl %ecx
popl %edi
popl %esi
popl %ebx
@@ -295,19 +111,13 @@
.align 16
bn_mul_words:
.L_bn_mul_words_begin:
- call .L010PIC_me_up
-.L010PIC_me_up:
- popl %eax
- leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc .L011mw_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
.align 16
-.L012mw_sse2_loop:
+.L004mw_sse2_loop:
movd (%edx),%mm2
pmuludq %mm0,%mm2
leal 4(%edx),%edx
@@ -316,156 +126,10 @@
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
- jnz .L012mw_sse2_loop
+ jnz .L004mw_sse2_loop
movd %mm1,%eax
emms
ret
-.align 16
-.L011mw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 24(%esp),%ebx
- movl 28(%esp),%ebp
- movl 32(%esp),%ecx
- andl $4294967288,%ebp
- jz .L013mw_finish
-.L014mw_loop:
-
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
-
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
-
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
-
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
-
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
-
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
-
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-
- movl 28(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- addl $32,%ebx
- addl $32,%edi
- subl $8,%ebp
- jz .L013mw_finish
- jmp .L014mw_loop
-.L013mw_finish:
- movl 28(%esp),%ebp
- andl $7,%ebp
- jnz .L015mw_finish2
- jmp .L016mw_end
-.L015mw_finish2:
-
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- decl %ebp
- jz .L016mw_end
-
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-.L016mw_end:
- movl %esi,%eax
popl %edi
popl %esi
popl %ebx
@@ -478,136 +142,20 @@
.align 16
bn_sqr_words:
.L_bn_sqr_words_begin:
- call .L017PIC_me_up
-.L017PIC_me_up:
- popl %eax
- leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc .L018sqr_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
.align 16
-.L019sqr_sse2_loop:
+.L005sqr_sse2_loop:
movd (%edx),%mm0
pmuludq %mm0,%mm0
leal 4(%edx),%edx
movq %mm0,(%eax)
subl $1,%ecx
leal 8(%eax),%eax
- jnz .L019sqr_sse2_loop
+ jnz .L005sqr_sse2_loop
emms
ret
-.align 16
-.L018sqr_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%ebx
- andl $4294967288,%ebx
- jz .L020sw_finish
-.L021sw_loop:
-
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- movl %edx,4(%esi)
-
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- movl %edx,12(%esi)
-
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- movl %edx,20(%esi)
-
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- movl %edx,28(%esi)
-
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- movl %edx,36(%esi)
-
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- movl %edx,44(%esi)
-
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
-
- movl 28(%edi),%eax
- mull %eax
- movl %eax,56(%esi)
- movl %edx,60(%esi)
-
- addl $32,%edi
- addl $64,%esi
- subl $8,%ebx
- jnz .L021sw_loop
-.L020sw_finish:
- movl 28(%esp),%ebx
- andl $7,%ebx
- jz .L022sw_end
-
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- decl %ebx
- movl %edx,4(%esi)
- jz .L022sw_end
-
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- decl %ebx
- movl %edx,12(%esi)
- jz .L022sw_end
-
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- decl %ebx
- movl %edx,20(%esi)
- jz .L022sw_end
-
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- decl %ebx
- movl %edx,28(%esi)
- jz .L022sw_end
-
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- decl %ebx
- movl %edx,36(%esi)
- jz .L022sw_end
-
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- decl %ebx
- movl %edx,44(%esi)
- jz .L022sw_end
-
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
-.L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -643,8 +191,8 @@
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L023aw_finish
-.L024aw_loop:
+ jz .L006aw_finish
+.L007aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -722,11 +270,11 @@
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L024aw_loop
-.L023aw_finish:
+ jnz .L007aw_loop
+.L006aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L025aw_end
+ jz .L008aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -737,7 +285,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -748,7 +296,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -759,7 +307,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -770,7 +318,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -781,7 +329,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -792,7 +340,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L025aw_end
+ jz .L008aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -802,7 +350,7 @@
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L025aw_end:
+.L008aw_end:
popl %edi
popl %esi
popl %ebx
@@ -826,8 +374,8 @@
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L026aw_finish
-.L027aw_loop:
+ jz .L009aw_finish
+.L010aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -905,11 +453,11 @@
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L027aw_loop
-.L026aw_finish:
+ jnz .L010aw_loop
+.L009aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L028aw_end
+ jz .L011aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -920,7 +468,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -931,7 +479,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -942,7 +490,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -953,7 +501,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -964,7 +512,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -975,7 +523,7 @@
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L028aw_end
+ jz .L011aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -985,7 +533,7 @@
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L028aw_end:
+.L011aw_end:
popl %edi
popl %esi
popl %ebx
diff --git a/gen/bcm/bn-586-win.asm b/gen/bcm/bn-586-win.asm
index f7ddfa8..09aafb8 100644
--- a/gen/bcm/bn-586-win.asm
+++ b/gen/bcm/bn-586-win.asm
@@ -13,22 +13,18 @@
%else
section .text code
%endif
-;extern _OPENSSL_ia32cap_P
global _bn_mul_add_words
align 16
_bn_mul_add_words:
L$_bn_mul_add_words_begin:
- lea eax,[_OPENSSL_ia32cap_P]
- bt DWORD [eax],26
- jnc NEAR L$000maw_non_sse2
mov eax,DWORD [4+esp]
mov edx,DWORD [8+esp]
mov ecx,DWORD [12+esp]
movd mm0,DWORD [16+esp]
pxor mm1,mm1
- jmp NEAR L$001maw_sse2_entry
+ jmp NEAR L$000maw_sse2_entry
align 16
-L$002maw_sse2_unrolled:
+L$001maw_sse2_unrolled:
movd mm3,DWORD [eax]
paddq mm1,mm3
movd mm2,DWORD [edx]
@@ -88,12 +84,12 @@
lea eax,[32+eax]
psrlq mm1,32
sub ecx,8
- jz NEAR L$003maw_sse2_exit
-L$001maw_sse2_entry:
+ jz NEAR L$002maw_sse2_exit
+L$000maw_sse2_entry:
test ecx,4294967288
- jnz NEAR L$002maw_sse2_unrolled
+ jnz NEAR L$001maw_sse2_unrolled
align 4
-L$004maw_sse2_loop:
+L$003maw_sse2_loop:
movd mm2,DWORD [edx]
movd mm3,DWORD [eax]
pmuludq mm2,mm0
@@ -104,189 +100,11 @@
sub ecx,1
psrlq mm1,32
lea eax,[4+eax]
- jnz NEAR L$004maw_sse2_loop
-L$003maw_sse2_exit:
+ jnz NEAR L$003maw_sse2_loop
+L$002maw_sse2_exit:
movd eax,mm1
emms
ret
-align 16
-L$000maw_non_sse2:
- push ebp
- push ebx
- push esi
- push edi
- ;
- xor esi,esi
- mov edi,DWORD [20+esp]
- mov ecx,DWORD [28+esp]
- mov ebx,DWORD [24+esp]
- and ecx,4294967288
- mov ebp,DWORD [32+esp]
- push ecx
- jz NEAR L$005maw_finish
-align 16
-L$006maw_loop:
- ; Round 0
- mov eax,DWORD [ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [edi]
- adc edx,0
- mov DWORD [edi],eax
- mov esi,edx
- ; Round 4
- mov eax,DWORD [4+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [4+edi]
- adc edx,0
- mov DWORD [4+edi],eax
- mov esi,edx
- ; Round 8
- mov eax,DWORD [8+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [8+edi]
- adc edx,0
- mov DWORD [8+edi],eax
- mov esi,edx
- ; Round 12
- mov eax,DWORD [12+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [12+edi]
- adc edx,0
- mov DWORD [12+edi],eax
- mov esi,edx
- ; Round 16
- mov eax,DWORD [16+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [16+edi]
- adc edx,0
- mov DWORD [16+edi],eax
- mov esi,edx
- ; Round 20
- mov eax,DWORD [20+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [20+edi]
- adc edx,0
- mov DWORD [20+edi],eax
- mov esi,edx
- ; Round 24
- mov eax,DWORD [24+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [24+edi]
- adc edx,0
- mov DWORD [24+edi],eax
- mov esi,edx
- ; Round 28
- mov eax,DWORD [28+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [28+edi]
- adc edx,0
- mov DWORD [28+edi],eax
- mov esi,edx
- ;
- sub ecx,8
- lea ebx,[32+ebx]
- lea edi,[32+edi]
- jnz NEAR L$006maw_loop
-L$005maw_finish:
- mov ecx,DWORD [32+esp]
- and ecx,7
- jnz NEAR L$007maw_finish2
- jmp NEAR L$008maw_end
-L$007maw_finish2:
- ; Tail Round 0
- mov eax,DWORD [ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [edi]
- adc edx,0
- dec ecx
- mov DWORD [edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 1
- mov eax,DWORD [4+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [4+edi]
- adc edx,0
- dec ecx
- mov DWORD [4+edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 2
- mov eax,DWORD [8+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [8+edi]
- adc edx,0
- dec ecx
- mov DWORD [8+edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 3
- mov eax,DWORD [12+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [12+edi]
- adc edx,0
- dec ecx
- mov DWORD [12+edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 4
- mov eax,DWORD [16+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [16+edi]
- adc edx,0
- dec ecx
- mov DWORD [16+edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 5
- mov eax,DWORD [20+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [20+edi]
- adc edx,0
- dec ecx
- mov DWORD [20+edi],eax
- mov esi,edx
- jz NEAR L$008maw_end
- ; Tail Round 6
- mov eax,DWORD [24+ebx]
- mul ebp
- add eax,esi
- adc edx,0
- add eax,DWORD [24+edi]
- adc edx,0
- mov DWORD [24+edi],eax
- mov esi,edx
-L$008maw_end:
- mov eax,esi
- pop ecx
pop edi
pop esi
pop ebx
@@ -296,16 +114,13 @@
align 16
_bn_mul_words:
L$_bn_mul_words_begin:
- lea eax,[_OPENSSL_ia32cap_P]
- bt DWORD [eax],26
- jnc NEAR L$009mw_non_sse2
mov eax,DWORD [4+esp]
mov edx,DWORD [8+esp]
mov ecx,DWORD [12+esp]
movd mm0,DWORD [16+esp]
pxor mm1,mm1
align 16
-L$010mw_sse2_loop:
+L$004mw_sse2_loop:
movd mm2,DWORD [edx]
pmuludq mm2,mm0
lea edx,[4+edx]
@@ -314,156 +129,10 @@
sub ecx,1
psrlq mm1,32
lea eax,[4+eax]
- jnz NEAR L$010mw_sse2_loop
+ jnz NEAR L$004mw_sse2_loop
movd eax,mm1
emms
ret
-align 16
-L$009mw_non_sse2:
- push ebp
- push ebx
- push esi
- push edi
- ;
- xor esi,esi
- mov edi,DWORD [20+esp]
- mov ebx,DWORD [24+esp]
- mov ebp,DWORD [28+esp]
- mov ecx,DWORD [32+esp]
- and ebp,4294967288
- jz NEAR L$011mw_finish
-L$012mw_loop:
- ; Round 0
- mov eax,DWORD [ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [edi],eax
- mov esi,edx
- ; Round 4
- mov eax,DWORD [4+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [4+edi],eax
- mov esi,edx
- ; Round 8
- mov eax,DWORD [8+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [8+edi],eax
- mov esi,edx
- ; Round 12
- mov eax,DWORD [12+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [12+edi],eax
- mov esi,edx
- ; Round 16
- mov eax,DWORD [16+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [16+edi],eax
- mov esi,edx
- ; Round 20
- mov eax,DWORD [20+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [20+edi],eax
- mov esi,edx
- ; Round 24
- mov eax,DWORD [24+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [24+edi],eax
- mov esi,edx
- ; Round 28
- mov eax,DWORD [28+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [28+edi],eax
- mov esi,edx
- ;
- add ebx,32
- add edi,32
- sub ebp,8
- jz NEAR L$011mw_finish
- jmp NEAR L$012mw_loop
-L$011mw_finish:
- mov ebp,DWORD [28+esp]
- and ebp,7
- jnz NEAR L$013mw_finish2
- jmp NEAR L$014mw_end
-L$013mw_finish2:
- ; Tail Round 0
- mov eax,DWORD [ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 1
- mov eax,DWORD [4+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [4+edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 2
- mov eax,DWORD [8+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [8+edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 3
- mov eax,DWORD [12+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [12+edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 4
- mov eax,DWORD [16+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [16+edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 5
- mov eax,DWORD [20+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [20+edi],eax
- mov esi,edx
- dec ebp
- jz NEAR L$014mw_end
- ; Tail Round 6
- mov eax,DWORD [24+ebx]
- mul ecx
- add eax,esi
- adc edx,0
- mov DWORD [24+edi],eax
- mov esi,edx
-L$014mw_end:
- mov eax,esi
pop edi
pop esi
pop ebx
@@ -473,133 +142,20 @@
align 16
_bn_sqr_words:
L$_bn_sqr_words_begin:
- lea eax,[_OPENSSL_ia32cap_P]
- bt DWORD [eax],26
- jnc NEAR L$015sqr_non_sse2
mov eax,DWORD [4+esp]
mov edx,DWORD [8+esp]
mov ecx,DWORD [12+esp]
align 16
-L$016sqr_sse2_loop:
+L$005sqr_sse2_loop:
movd mm0,DWORD [edx]
pmuludq mm0,mm0
lea edx,[4+edx]
movq [eax],mm0
sub ecx,1
lea eax,[8+eax]
- jnz NEAR L$016sqr_sse2_loop
+ jnz NEAR L$005sqr_sse2_loop
emms
ret
-align 16
-L$015sqr_non_sse2:
- push ebp
- push ebx
- push esi
- push edi
- ;
- mov esi,DWORD [20+esp]
- mov edi,DWORD [24+esp]
- mov ebx,DWORD [28+esp]
- and ebx,4294967288
- jz NEAR L$017sw_finish
-L$018sw_loop:
- ; Round 0
- mov eax,DWORD [edi]
- mul eax
- mov DWORD [esi],eax
- mov DWORD [4+esi],edx
- ; Round 4
- mov eax,DWORD [4+edi]
- mul eax
- mov DWORD [8+esi],eax
- mov DWORD [12+esi],edx
- ; Round 8
- mov eax,DWORD [8+edi]
- mul eax
- mov DWORD [16+esi],eax
- mov DWORD [20+esi],edx
- ; Round 12
- mov eax,DWORD [12+edi]
- mul eax
- mov DWORD [24+esi],eax
- mov DWORD [28+esi],edx
- ; Round 16
- mov eax,DWORD [16+edi]
- mul eax
- mov DWORD [32+esi],eax
- mov DWORD [36+esi],edx
- ; Round 20
- mov eax,DWORD [20+edi]
- mul eax
- mov DWORD [40+esi],eax
- mov DWORD [44+esi],edx
- ; Round 24
- mov eax,DWORD [24+edi]
- mul eax
- mov DWORD [48+esi],eax
- mov DWORD [52+esi],edx
- ; Round 28
- mov eax,DWORD [28+edi]
- mul eax
- mov DWORD [56+esi],eax
- mov DWORD [60+esi],edx
- ;
- add edi,32
- add esi,64
- sub ebx,8
- jnz NEAR L$018sw_loop
-L$017sw_finish:
- mov ebx,DWORD [28+esp]
- and ebx,7
- jz NEAR L$019sw_end
- ; Tail Round 0
- mov eax,DWORD [edi]
- mul eax
- mov DWORD [esi],eax
- dec ebx
- mov DWORD [4+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 1
- mov eax,DWORD [4+edi]
- mul eax
- mov DWORD [8+esi],eax
- dec ebx
- mov DWORD [12+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 2
- mov eax,DWORD [8+edi]
- mul eax
- mov DWORD [16+esi],eax
- dec ebx
- mov DWORD [20+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 3
- mov eax,DWORD [12+edi]
- mul eax
- mov DWORD [24+esi],eax
- dec ebx
- mov DWORD [28+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 4
- mov eax,DWORD [16+edi]
- mul eax
- mov DWORD [32+esi],eax
- dec ebx
- mov DWORD [36+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 5
- mov eax,DWORD [20+edi]
- mul eax
- mov DWORD [40+esi],eax
- dec ebx
- mov DWORD [44+esi],edx
- jz NEAR L$019sw_end
- ; Tail Round 6
- mov eax,DWORD [24+edi]
- mul eax
- mov DWORD [48+esi],eax
- mov DWORD [52+esi],edx
-L$019sw_end:
pop edi
pop esi
pop ebx
@@ -629,8 +185,8 @@
mov ebp,DWORD [32+esp]
xor eax,eax
and ebp,4294967288
- jz NEAR L$020aw_finish
-L$021aw_loop:
+ jz NEAR L$006aw_finish
+L$007aw_loop:
; Round 0
mov ecx,DWORD [esi]
mov edx,DWORD [edi]
@@ -708,11 +264,11 @@
add edi,32
add ebx,32
sub ebp,8
- jnz NEAR L$021aw_loop
-L$020aw_finish:
+ jnz NEAR L$007aw_loop
+L$006aw_finish:
mov ebp,DWORD [32+esp]
and ebp,7
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 0
mov ecx,DWORD [esi]
mov edx,DWORD [edi]
@@ -723,7 +279,7 @@
adc eax,0
dec ebp
mov DWORD [ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 1
mov ecx,DWORD [4+esi]
mov edx,DWORD [4+edi]
@@ -734,7 +290,7 @@
adc eax,0
dec ebp
mov DWORD [4+ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 2
mov ecx,DWORD [8+esi]
mov edx,DWORD [8+edi]
@@ -745,7 +301,7 @@
adc eax,0
dec ebp
mov DWORD [8+ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 3
mov ecx,DWORD [12+esi]
mov edx,DWORD [12+edi]
@@ -756,7 +312,7 @@
adc eax,0
dec ebp
mov DWORD [12+ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 4
mov ecx,DWORD [16+esi]
mov edx,DWORD [16+edi]
@@ -767,7 +323,7 @@
adc eax,0
dec ebp
mov DWORD [16+ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 5
mov ecx,DWORD [20+esi]
mov edx,DWORD [20+edi]
@@ -778,7 +334,7 @@
adc eax,0
dec ebp
mov DWORD [20+ebx],ecx
- jz NEAR L$022aw_end
+ jz NEAR L$008aw_end
; Tail Round 6
mov ecx,DWORD [24+esi]
mov edx,DWORD [24+edi]
@@ -788,7 +344,7 @@
add ecx,edx
adc eax,0
mov DWORD [24+ebx],ecx
-L$022aw_end:
+L$008aw_end:
pop edi
pop esi
pop ebx
@@ -809,8 +365,8 @@
mov ebp,DWORD [32+esp]
xor eax,eax
and ebp,4294967288
- jz NEAR L$023aw_finish
-L$024aw_loop:
+ jz NEAR L$009aw_finish
+L$010aw_loop:
; Round 0
mov ecx,DWORD [esi]
mov edx,DWORD [edi]
@@ -888,11 +444,11 @@
add edi,32
add ebx,32
sub ebp,8
- jnz NEAR L$024aw_loop
-L$023aw_finish:
+ jnz NEAR L$010aw_loop
+L$009aw_finish:
mov ebp,DWORD [32+esp]
and ebp,7
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 0
mov ecx,DWORD [esi]
mov edx,DWORD [edi]
@@ -903,7 +459,7 @@
adc eax,0
dec ebp
mov DWORD [ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 1
mov ecx,DWORD [4+esi]
mov edx,DWORD [4+edi]
@@ -914,7 +470,7 @@
adc eax,0
dec ebp
mov DWORD [4+ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 2
mov ecx,DWORD [8+esi]
mov edx,DWORD [8+edi]
@@ -925,7 +481,7 @@
adc eax,0
dec ebp
mov DWORD [8+ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 3
mov ecx,DWORD [12+esi]
mov edx,DWORD [12+edi]
@@ -936,7 +492,7 @@
adc eax,0
dec ebp
mov DWORD [12+ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 4
mov ecx,DWORD [16+esi]
mov edx,DWORD [16+edi]
@@ -947,7 +503,7 @@
adc eax,0
dec ebp
mov DWORD [16+ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 5
mov ecx,DWORD [20+esi]
mov edx,DWORD [20+edi]
@@ -958,7 +514,7 @@
adc eax,0
dec ebp
mov DWORD [20+ebx],ecx
- jz NEAR L$025aw_end
+ jz NEAR L$011aw_end
; Tail Round 6
mov ecx,DWORD [24+esi]
mov edx,DWORD [24+edi]
@@ -968,14 +524,12 @@
sub ecx,edx
adc eax,0
mov DWORD [24+ebx],ecx
-L$025aw_end:
+L$011aw_end:
pop edi
pop esi
pop ebx
pop ebp
ret
-segment .bss
-common _OPENSSL_ia32cap_P 16
%else
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S
index cfdeac1..d4d05cb 100644
--- a/gen/bcm/sha512-586-apple.S
+++ b/gen/bcm/sha512-586-apple.S
@@ -32,8 +32,6 @@
movl %ebx,12(%esp)
movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
movl (%edx),%ecx
- testl $67108864,%ecx
- jz L002loop_x86
movl 4(%edx),%edx
movq (%esi),%mm0
andl $16777216,%ecx
@@ -47,11 +45,11 @@
movq 48(%esi),%mm6
movq 56(%esi),%mm7
cmpl $16777728,%ecx
- je L003SSSE3
+ je L002SSSE3
subl $80,%esp
- jmp L004loop_sse2
+ jmp L003loop_sse2
.align 4,0x90
-L004loop_sse2:
+L003loop_sse2:
movq %mm1,8(%esp)
movq %mm2,16(%esp)
movq %mm3,24(%esp)
@@ -66,9 +64,9 @@
movl $15,%edx
bswap %eax
bswap %ebx
- jmp L00500_14_sse2
+ jmp L00400_14_sse2
.align 4,0x90
-L00500_14_sse2:
+L00400_14_sse2:
movd %eax,%mm1
movl (%edi),%eax
movd %ebx,%mm7
@@ -129,7 +127,7 @@
paddq %mm6,%mm3
movq 48(%esp),%mm6
decl %edx
- jnz L00500_14_sse2
+ jnz L00400_14_sse2
movd %eax,%mm1
movd %ebx,%mm7
punpckldq %mm1,%mm7
@@ -185,9 +183,9 @@
paddq %mm6,%mm3
pxor %mm0,%mm0
movl $32,%edx
- jmp L00616_79_sse2
+ jmp L00516_79_sse2
.align 4,0x90
-L00616_79_sse2:
+L00516_79_sse2:
movq 88(%esp),%mm5
movq %mm7,%mm1
psrlq $1,%mm7
@@ -341,7 +339,7 @@
paddq %mm6,%mm0
addl $8,%ebp
decl %edx
- jnz L00616_79_sse2
+ jnz L00516_79_sse2
paddq %mm3,%mm0
movq 8(%esp),%mm1
movq 24(%esp),%mm3
@@ -369,7 +367,7 @@
leal (%esp,%eax,1),%esp
subl %eax,%ebp
cmpl 88(%esp),%edi
- jb L004loop_sse2
+ jb L003loop_sse2
movl 92(%esp),%esp
emms
popl %edi
@@ -378,7 +376,7 @@
popl %ebp
ret
.align 5,0x90
-L003SSSE3:
+L002SSSE3:
leal -64(%esp),%edx
subl $256,%esp
movdqa 640(%ebp),%xmm1
@@ -435,7 +433,7 @@
movdqa %xmm2,-16(%edx)
nop
.align 5,0x90
-L007loop_ssse3:
+L006loop_ssse3:
movdqa 16(%edx),%xmm2
movdqa %xmm3,48(%edx)
leal 128(%ebp),%ebp
@@ -452,9 +450,9 @@
pxor %mm1,%mm2
movq %mm7,56(%esp)
pxor %mm3,%mm3
- jmp L00800_47_ssse3
+ jmp L00700_47_ssse3
.align 5,0x90
-L00800_47_ssse3:
+L00700_47_ssse3:
movdqa %xmm5,%xmm3
movdqa %xmm2,%xmm1
.byte 102,15,58,15,208,8
@@ -1473,7 +1471,7 @@
movdqa %xmm1,-16(%edx)
leal 128(%ebp),%ebp
decl %ecx
- jnz L00800_47_ssse3
+ jnz L00700_47_ssse3
movdqa (%ebp),%xmm1
leal -640(%ebp),%ebp
movdqu (%ebx),%xmm0
@@ -2285,7 +2283,7 @@
movq %mm6,48(%esi)
movq %mm7,56(%esi)
cmpl %eax,%edi
- jb L007loop_ssse3
+ jb L006loop_ssse3
movl 76(%edx),%esp
emms
popl %edi
@@ -2293,454 +2291,6 @@
popl %ebx
popl %ebp
ret
-.align 4,0x90
-L002loop_x86:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- movl 28(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- movl 44(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- movl 60(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 64(%edi),%eax
- movl 68(%edi),%ebx
- movl 72(%edi),%ecx
- movl 76(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 80(%edi),%eax
- movl 84(%edi),%ebx
- movl 88(%edi),%ecx
- movl 92(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 96(%edi),%eax
- movl 100(%edi),%ebx
- movl 104(%edi),%ecx
- movl 108(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 112(%edi),%eax
- movl 116(%edi),%ebx
- movl 120(%edi),%ecx
- movl 124(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- addl $128,%edi
- subl $72,%esp
- movl %edi,204(%esp)
- leal 8(%esp),%edi
- movl $16,%ecx
-.long 2784229001
-.align 4,0x90
-L00900_15_x86:
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $148,%dl
- jne L00900_15_x86
-.align 4,0x90
-L01016_79_x86:
- movl 312(%esp),%ecx
- movl 316(%esp),%edx
- movl %ecx,%esi
- shrl $1,%ecx
- movl %edx,%edi
- shrl $1,%edx
- movl %ecx,%eax
- shll $24,%esi
- movl %edx,%ebx
- shll $24,%edi
- xorl %esi,%ebx
- shrl $6,%ecx
- xorl %edi,%eax
- shrl $6,%edx
- xorl %ecx,%eax
- shll $7,%esi
- xorl %edx,%ebx
- shll $1,%edi
- xorl %esi,%ebx
- shrl $1,%ecx
- xorl %edi,%eax
- shrl $1,%edx
- xorl %ecx,%eax
- shll $6,%edi
- xorl %edx,%ebx
- xorl %edi,%eax
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movl 208(%esp),%ecx
- movl 212(%esp),%edx
- movl %ecx,%esi
- shrl $6,%ecx
- movl %edx,%edi
- shrl $6,%edx
- movl %ecx,%eax
- shll $3,%esi
- movl %edx,%ebx
- shll $3,%edi
- xorl %esi,%eax
- shrl $13,%ecx
- xorl %edi,%ebx
- shrl $13,%edx
- xorl %ecx,%eax
- shll $10,%esi
- xorl %edx,%ebx
- shll $10,%edi
- xorl %esi,%ebx
- shrl $10,%ecx
- xorl %edi,%eax
- shrl $10,%edx
- xorl %ecx,%ebx
- shll $13,%edi
- xorl %edx,%eax
- xorl %edi,%eax
- movl 320(%esp),%ecx
- movl 324(%esp),%edx
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- movl 248(%esp),%esi
- movl 252(%esp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,192(%esp)
- movl %ebx,196(%esp)
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $23,%dl
- jne L01016_79_x86
- movl 840(%esp),%esi
- movl 844(%esp),%edi
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edx
- addl 8(%esp),%eax
- adcl 12(%esp),%ebx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- addl 16(%esp),%ecx
- adcl 20(%esp),%edx
- movl %ecx,8(%esi)
- movl %edx,12(%esi)
- movl 16(%esi),%eax
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edx
- addl 24(%esp),%eax
- adcl 28(%esp),%ebx
- movl %eax,16(%esi)
- movl %ebx,20(%esi)
- addl 32(%esp),%ecx
- adcl 36(%esp),%edx
- movl %ecx,24(%esi)
- movl %edx,28(%esi)
- movl 32(%esi),%eax
- movl 36(%esi),%ebx
- movl 40(%esi),%ecx
- movl 44(%esi),%edx
- addl 40(%esp),%eax
- adcl 44(%esp),%ebx
- movl %eax,32(%esi)
- movl %ebx,36(%esi)
- addl 48(%esp),%ecx
- adcl 52(%esp),%edx
- movl %ecx,40(%esi)
- movl %edx,44(%esi)
- movl 48(%esi),%eax
- movl 52(%esi),%ebx
- movl 56(%esi),%ecx
- movl 60(%esi),%edx
- addl 56(%esp),%eax
- adcl 60(%esp),%ebx
- movl %eax,48(%esi)
- movl %ebx,52(%esi)
- addl 64(%esp),%ecx
- adcl 68(%esp),%edx
- movl %ecx,56(%esi)
- movl %edx,60(%esi)
- addl $840,%esp
- subl $640,%ebp
- cmpl 8(%esp),%edi
- jb L002loop_x86
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 6,0x90
L001K512:
.long 3609767458,1116352408
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S
index bb2884d..3dc0ecb 100644
--- a/gen/bcm/sha512-586-linux.S
+++ b/gen/bcm/sha512-586-linux.S
@@ -33,8 +33,6 @@
movl %ebx,12(%esp)
leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
movl (%edx),%ecx
- testl $67108864,%ecx
- jz .L002loop_x86
movl 4(%edx),%edx
movq (%esi),%mm0
andl $16777216,%ecx
@@ -48,11 +46,11 @@
movq 48(%esi),%mm6
movq 56(%esi),%mm7
cmpl $16777728,%ecx
- je .L003SSSE3
+ je .L002SSSE3
subl $80,%esp
- jmp .L004loop_sse2
+ jmp .L003loop_sse2
.align 16
-.L004loop_sse2:
+.L003loop_sse2:
movq %mm1,8(%esp)
movq %mm2,16(%esp)
movq %mm3,24(%esp)
@@ -67,9 +65,9 @@
movl $15,%edx
bswap %eax
bswap %ebx
- jmp .L00500_14_sse2
+ jmp .L00400_14_sse2
.align 16
-.L00500_14_sse2:
+.L00400_14_sse2:
movd %eax,%mm1
movl (%edi),%eax
movd %ebx,%mm7
@@ -130,7 +128,7 @@
paddq %mm6,%mm3
movq 48(%esp),%mm6
decl %edx
- jnz .L00500_14_sse2
+ jnz .L00400_14_sse2
movd %eax,%mm1
movd %ebx,%mm7
punpckldq %mm1,%mm7
@@ -186,9 +184,9 @@
paddq %mm6,%mm3
pxor %mm0,%mm0
movl $32,%edx
- jmp .L00616_79_sse2
+ jmp .L00516_79_sse2
.align 16
-.L00616_79_sse2:
+.L00516_79_sse2:
movq 88(%esp),%mm5
movq %mm7,%mm1
psrlq $1,%mm7
@@ -342,7 +340,7 @@
paddq %mm6,%mm0
addl $8,%ebp
decl %edx
- jnz .L00616_79_sse2
+ jnz .L00516_79_sse2
paddq %mm3,%mm0
movq 8(%esp),%mm1
movq 24(%esp),%mm3
@@ -370,7 +368,7 @@
leal (%esp,%eax,1),%esp
subl %eax,%ebp
cmpl 88(%esp),%edi
- jb .L004loop_sse2
+ jb .L003loop_sse2
movl 92(%esp),%esp
emms
popl %edi
@@ -379,7 +377,7 @@
popl %ebp
ret
.align 32
-.L003SSSE3:
+.L002SSSE3:
leal -64(%esp),%edx
subl $256,%esp
movdqa 640(%ebp),%xmm1
@@ -436,7 +434,7 @@
movdqa %xmm2,-16(%edx)
nop
.align 32
-.L007loop_ssse3:
+.L006loop_ssse3:
movdqa 16(%edx),%xmm2
movdqa %xmm3,48(%edx)
leal 128(%ebp),%ebp
@@ -453,9 +451,9 @@
pxor %mm1,%mm2
movq %mm7,56(%esp)
pxor %mm3,%mm3
- jmp .L00800_47_ssse3
+ jmp .L00700_47_ssse3
.align 32
-.L00800_47_ssse3:
+.L00700_47_ssse3:
movdqa %xmm5,%xmm3
movdqa %xmm2,%xmm1
.byte 102,15,58,15,208,8
@@ -1474,7 +1472,7 @@
movdqa %xmm1,-16(%edx)
leal 128(%ebp),%ebp
decl %ecx
- jnz .L00800_47_ssse3
+ jnz .L00700_47_ssse3
movdqa (%ebp),%xmm1
leal -640(%ebp),%ebp
movdqu (%ebx),%xmm0
@@ -2286,7 +2284,7 @@
movq %mm6,48(%esi)
movq %mm7,56(%esi)
cmpl %eax,%edi
- jb .L007loop_ssse3
+ jb .L006loop_ssse3
movl 76(%edx),%esp
emms
popl %edi
@@ -2294,454 +2292,6 @@
popl %ebx
popl %ebp
ret
-.align 16
-.L002loop_x86:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- movl 28(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- movl 44(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- movl 60(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 64(%edi),%eax
- movl 68(%edi),%ebx
- movl 72(%edi),%ecx
- movl 76(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 80(%edi),%eax
- movl 84(%edi),%ebx
- movl 88(%edi),%ecx
- movl 92(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 96(%edi),%eax
- movl 100(%edi),%ebx
- movl 104(%edi),%ecx
- movl 108(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 112(%edi),%eax
- movl 116(%edi),%ebx
- movl 120(%edi),%ecx
- movl 124(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- addl $128,%edi
- subl $72,%esp
- movl %edi,204(%esp)
- leal 8(%esp),%edi
- movl $16,%ecx
-.long 2784229001
-.align 16
-.L00900_15_x86:
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $148,%dl
- jne .L00900_15_x86
-.align 16
-.L01016_79_x86:
- movl 312(%esp),%ecx
- movl 316(%esp),%edx
- movl %ecx,%esi
- shrl $1,%ecx
- movl %edx,%edi
- shrl $1,%edx
- movl %ecx,%eax
- shll $24,%esi
- movl %edx,%ebx
- shll $24,%edi
- xorl %esi,%ebx
- shrl $6,%ecx
- xorl %edi,%eax
- shrl $6,%edx
- xorl %ecx,%eax
- shll $7,%esi
- xorl %edx,%ebx
- shll $1,%edi
- xorl %esi,%ebx
- shrl $1,%ecx
- xorl %edi,%eax
- shrl $1,%edx
- xorl %ecx,%eax
- shll $6,%edi
- xorl %edx,%ebx
- xorl %edi,%eax
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movl 208(%esp),%ecx
- movl 212(%esp),%edx
- movl %ecx,%esi
- shrl $6,%ecx
- movl %edx,%edi
- shrl $6,%edx
- movl %ecx,%eax
- shll $3,%esi
- movl %edx,%ebx
- shll $3,%edi
- xorl %esi,%eax
- shrl $13,%ecx
- xorl %edi,%ebx
- shrl $13,%edx
- xorl %ecx,%eax
- shll $10,%esi
- xorl %edx,%ebx
- shll $10,%edi
- xorl %esi,%ebx
- shrl $10,%ecx
- xorl %edi,%eax
- shrl $10,%edx
- xorl %ecx,%ebx
- shll $13,%edi
- xorl %edx,%eax
- xorl %edi,%eax
- movl 320(%esp),%ecx
- movl 324(%esp),%edx
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- movl 248(%esp),%esi
- movl 252(%esp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,192(%esp)
- movl %ebx,196(%esp)
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $23,%dl
- jne .L01016_79_x86
- movl 840(%esp),%esi
- movl 844(%esp),%edi
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edx
- addl 8(%esp),%eax
- adcl 12(%esp),%ebx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- addl 16(%esp),%ecx
- adcl 20(%esp),%edx
- movl %ecx,8(%esi)
- movl %edx,12(%esi)
- movl 16(%esi),%eax
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edx
- addl 24(%esp),%eax
- adcl 28(%esp),%ebx
- movl %eax,16(%esi)
- movl %ebx,20(%esi)
- addl 32(%esp),%ecx
- adcl 36(%esp),%edx
- movl %ecx,24(%esi)
- movl %edx,28(%esi)
- movl 32(%esi),%eax
- movl 36(%esi),%ebx
- movl 40(%esi),%ecx
- movl 44(%esi),%edx
- addl 40(%esp),%eax
- adcl 44(%esp),%ebx
- movl %eax,32(%esi)
- movl %ebx,36(%esi)
- addl 48(%esp),%ecx
- adcl 52(%esp),%edx
- movl %ecx,40(%esi)
- movl %edx,44(%esi)
- movl 48(%esi),%eax
- movl 52(%esi),%ebx
- movl 56(%esi),%ecx
- movl 60(%esi),%edx
- addl 56(%esp),%eax
- adcl 60(%esp),%ebx
- movl %eax,48(%esi)
- movl %ebx,52(%esi)
- addl 64(%esp),%ecx
- adcl 68(%esp),%edx
- movl %ecx,56(%esi)
- movl %edx,60(%esi)
- addl $840,%esp
- subl $640,%ebp
- cmpl 8(%esp),%edi
- jb .L002loop_x86
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 64
.L001K512:
.long 3609767458,1116352408
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm
index 3603a6d..ba55f7d 100644
--- a/gen/bcm/sha512-586-win.asm
+++ b/gen/bcm/sha512-586-win.asm
@@ -40,8 +40,6 @@
mov DWORD [12+esp],ebx
lea edx,[_OPENSSL_ia32cap_P]
mov ecx,DWORD [edx]
- test ecx,67108864
- jz NEAR L$002loop_x86
mov edx,DWORD [4+edx]
movq mm0,[esi]
and ecx,16777216
@@ -55,11 +53,11 @@
movq mm6,[48+esi]
movq mm7,[56+esi]
cmp ecx,16777728
- je NEAR L$003SSSE3
+ je NEAR L$002SSSE3
sub esp,80
- jmp NEAR L$004loop_sse2
+ jmp NEAR L$003loop_sse2
align 16
-L$004loop_sse2:
+L$003loop_sse2:
movq [8+esp],mm1
movq [16+esp],mm2
movq [24+esp],mm3
@@ -74,9 +72,9 @@
mov edx,15
bswap eax
bswap ebx
- jmp NEAR L$00500_14_sse2
+ jmp NEAR L$00400_14_sse2
align 16
-L$00500_14_sse2:
+L$00400_14_sse2:
movd mm1,eax
mov eax,DWORD [edi]
movd mm7,ebx
@@ -137,7 +135,7 @@
paddq mm3,mm6
movq mm6,[48+esp]
dec edx
- jnz NEAR L$00500_14_sse2
+ jnz NEAR L$00400_14_sse2
movd mm1,eax
movd mm7,ebx
punpckldq mm7,mm1
@@ -193,9 +191,9 @@
paddq mm3,mm6
pxor mm0,mm0
mov edx,32
- jmp NEAR L$00616_79_sse2
+ jmp NEAR L$00516_79_sse2
align 16
-L$00616_79_sse2:
+L$00516_79_sse2:
movq mm5,[88+esp]
movq mm1,mm7
psrlq mm7,1
@@ -349,7 +347,7 @@
paddq mm0,mm6
add ebp,8
dec edx
- jnz NEAR L$00616_79_sse2
+ jnz NEAR L$00516_79_sse2
paddq mm0,mm3
movq mm1,[8+esp]
movq mm3,[24+esp]
@@ -377,7 +375,7 @@
lea esp,[eax*1+esp]
sub ebp,eax
cmp edi,DWORD [88+esp]
- jb NEAR L$004loop_sse2
+ jb NEAR L$003loop_sse2
mov esp,DWORD [92+esp]
emms
pop edi
@@ -386,7 +384,7 @@
pop ebp
ret
align 32
-L$003SSSE3:
+L$002SSSE3:
lea edx,[esp-64]
sub esp,256
movdqa xmm1,[640+ebp]
@@ -443,7 +441,7 @@
movdqa [edx-16],xmm2
nop
align 32
-L$007loop_ssse3:
+L$006loop_ssse3:
movdqa xmm2,[16+edx]
movdqa [48+edx],xmm3
lea ebp,[128+ebp]
@@ -460,9 +458,9 @@
pxor mm2,mm1
movq [56+esp],mm7
pxor mm3,mm3
- jmp NEAR L$00800_47_ssse3
+ jmp NEAR L$00700_47_ssse3
align 32
-L$00800_47_ssse3:
+L$00700_47_ssse3:
movdqa xmm3,xmm5
movdqa xmm1,xmm2
db 102,15,58,15,208,8
@@ -1481,7 +1479,7 @@
movdqa [edx-16],xmm1
lea ebp,[128+ebp]
dec ecx
- jnz NEAR L$00800_47_ssse3
+ jnz NEAR L$00700_47_ssse3
movdqa xmm1,[ebp]
lea ebp,[ebp-640]
movdqu xmm0,[ebx]
@@ -2293,7 +2291,7 @@
movq [48+esi],mm6
movq [56+esi],mm7
cmp edi,eax
- jb NEAR L$007loop_ssse3
+ jb NEAR L$006loop_ssse3
mov esp,DWORD [76+edx]
emms
pop edi
@@ -2301,454 +2299,6 @@
pop ebx
pop ebp
ret
-align 16
-L$002loop_x86:
- mov eax,DWORD [edi]
- mov ebx,DWORD [4+edi]
- mov ecx,DWORD [8+edi]
- mov edx,DWORD [12+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [16+edi]
- mov ebx,DWORD [20+edi]
- mov ecx,DWORD [24+edi]
- mov edx,DWORD [28+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [32+edi]
- mov ebx,DWORD [36+edi]
- mov ecx,DWORD [40+edi]
- mov edx,DWORD [44+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [48+edi]
- mov ebx,DWORD [52+edi]
- mov ecx,DWORD [56+edi]
- mov edx,DWORD [60+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [64+edi]
- mov ebx,DWORD [68+edi]
- mov ecx,DWORD [72+edi]
- mov edx,DWORD [76+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [80+edi]
- mov ebx,DWORD [84+edi]
- mov ecx,DWORD [88+edi]
- mov edx,DWORD [92+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [96+edi]
- mov ebx,DWORD [100+edi]
- mov ecx,DWORD [104+edi]
- mov edx,DWORD [108+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- mov eax,DWORD [112+edi]
- mov ebx,DWORD [116+edi]
- mov ecx,DWORD [120+edi]
- mov edx,DWORD [124+edi]
- bswap eax
- bswap ebx
- bswap ecx
- bswap edx
- push eax
- push ebx
- push ecx
- push edx
- add edi,128
- sub esp,72
- mov DWORD [204+esp],edi
- lea edi,[8+esp]
- mov ecx,16
-dd 2784229001
-align 16
-L$00900_15_x86:
- mov ecx,DWORD [40+esp]
- mov edx,DWORD [44+esp]
- mov esi,ecx
- shr ecx,9
- mov edi,edx
- shr edx,9
- mov ebx,ecx
- shl esi,14
- mov eax,edx
- shl edi,14
- xor ebx,esi
- shr ecx,5
- xor eax,edi
- shr edx,5
- xor eax,ecx
- shl esi,4
- xor ebx,edx
- shl edi,4
- xor ebx,esi
- shr ecx,4
- xor eax,edi
- shr edx,4
- xor eax,ecx
- shl esi,5
- xor ebx,edx
- shl edi,5
- xor eax,esi
- xor ebx,edi
- mov ecx,DWORD [48+esp]
- mov edx,DWORD [52+esp]
- mov esi,DWORD [56+esp]
- mov edi,DWORD [60+esp]
- add eax,DWORD [64+esp]
- adc ebx,DWORD [68+esp]
- xor ecx,esi
- xor edx,edi
- and ecx,DWORD [40+esp]
- and edx,DWORD [44+esp]
- add eax,DWORD [192+esp]
- adc ebx,DWORD [196+esp]
- xor ecx,esi
- xor edx,edi
- mov esi,DWORD [ebp]
- mov edi,DWORD [4+ebp]
- add eax,ecx
- adc ebx,edx
- mov ecx,DWORD [32+esp]
- mov edx,DWORD [36+esp]
- add eax,esi
- adc ebx,edi
- mov DWORD [esp],eax
- mov DWORD [4+esp],ebx
- add eax,ecx
- adc ebx,edx
- mov ecx,DWORD [8+esp]
- mov edx,DWORD [12+esp]
- mov DWORD [32+esp],eax
- mov DWORD [36+esp],ebx
- mov esi,ecx
- shr ecx,2
- mov edi,edx
- shr edx,2
- mov ebx,ecx
- shl esi,4
- mov eax,edx
- shl edi,4
- xor ebx,esi
- shr ecx,5
- xor eax,edi
- shr edx,5
- xor ebx,ecx
- shl esi,21
- xor eax,edx
- shl edi,21
- xor eax,esi
- shr ecx,21
- xor ebx,edi
- shr edx,21
- xor eax,ecx
- shl esi,5
- xor ebx,edx
- shl edi,5
- xor eax,esi
- xor ebx,edi
- mov ecx,DWORD [8+esp]
- mov edx,DWORD [12+esp]
- mov esi,DWORD [16+esp]
- mov edi,DWORD [20+esp]
- add eax,DWORD [esp]
- adc ebx,DWORD [4+esp]
- or ecx,esi
- or edx,edi
- and ecx,DWORD [24+esp]
- and edx,DWORD [28+esp]
- and esi,DWORD [8+esp]
- and edi,DWORD [12+esp]
- or ecx,esi
- or edx,edi
- add eax,ecx
- adc ebx,edx
- mov DWORD [esp],eax
- mov DWORD [4+esp],ebx
- mov dl,BYTE [ebp]
- sub esp,8
- lea ebp,[8+ebp]
- cmp dl,148
- jne NEAR L$00900_15_x86
-align 16
-L$01016_79_x86:
- mov ecx,DWORD [312+esp]
- mov edx,DWORD [316+esp]
- mov esi,ecx
- shr ecx,1
- mov edi,edx
- shr edx,1
- mov eax,ecx
- shl esi,24
- mov ebx,edx
- shl edi,24
- xor ebx,esi
- shr ecx,6
- xor eax,edi
- shr edx,6
- xor eax,ecx
- shl esi,7
- xor ebx,edx
- shl edi,1
- xor ebx,esi
- shr ecx,1
- xor eax,edi
- shr edx,1
- xor eax,ecx
- shl edi,6
- xor ebx,edx
- xor eax,edi
- mov DWORD [esp],eax
- mov DWORD [4+esp],ebx
- mov ecx,DWORD [208+esp]
- mov edx,DWORD [212+esp]
- mov esi,ecx
- shr ecx,6
- mov edi,edx
- shr edx,6
- mov eax,ecx
- shl esi,3
- mov ebx,edx
- shl edi,3
- xor eax,esi
- shr ecx,13
- xor ebx,edi
- shr edx,13
- xor eax,ecx
- shl esi,10
- xor ebx,edx
- shl edi,10
- xor ebx,esi
- shr ecx,10
- xor eax,edi
- shr edx,10
- xor ebx,ecx
- shl edi,13
- xor eax,edx
- xor eax,edi
- mov ecx,DWORD [320+esp]
- mov edx,DWORD [324+esp]
- add eax,DWORD [esp]
- adc ebx,DWORD [4+esp]
- mov esi,DWORD [248+esp]
- mov edi,DWORD [252+esp]
- add eax,ecx
- adc ebx,edx
- add eax,esi
- adc ebx,edi
- mov DWORD [192+esp],eax
- mov DWORD [196+esp],ebx
- mov ecx,DWORD [40+esp]
- mov edx,DWORD [44+esp]
- mov esi,ecx
- shr ecx,9
- mov edi,edx
- shr edx,9
- mov ebx,ecx
- shl esi,14
- mov eax,edx
- shl edi,14
- xor ebx,esi
- shr ecx,5
- xor eax,edi
- shr edx,5
- xor eax,ecx
- shl esi,4
- xor ebx,edx
- shl edi,4
- xor ebx,esi
- shr ecx,4
- xor eax,edi
- shr edx,4
- xor eax,ecx
- shl esi,5
- xor ebx,edx
- shl edi,5
- xor eax,esi
- xor ebx,edi
- mov ecx,DWORD [48+esp]
- mov edx,DWORD [52+esp]
- mov esi,DWORD [56+esp]
- mov edi,DWORD [60+esp]
- add eax,DWORD [64+esp]
- adc ebx,DWORD [68+esp]
- xor ecx,esi
- xor edx,edi
- and ecx,DWORD [40+esp]
- and edx,DWORD [44+esp]
- add eax,DWORD [192+esp]
- adc ebx,DWORD [196+esp]
- xor ecx,esi
- xor edx,edi
- mov esi,DWORD [ebp]
- mov edi,DWORD [4+ebp]
- add eax,ecx
- adc ebx,edx
- mov ecx,DWORD [32+esp]
- mov edx,DWORD [36+esp]
- add eax,esi
- adc ebx,edi
- mov DWORD [esp],eax
- mov DWORD [4+esp],ebx
- add eax,ecx
- adc ebx,edx
- mov ecx,DWORD [8+esp]
- mov edx,DWORD [12+esp]
- mov DWORD [32+esp],eax
- mov DWORD [36+esp],ebx
- mov esi,ecx
- shr ecx,2
- mov edi,edx
- shr edx,2
- mov ebx,ecx
- shl esi,4
- mov eax,edx
- shl edi,4
- xor ebx,esi
- shr ecx,5
- xor eax,edi
- shr edx,5
- xor ebx,ecx
- shl esi,21
- xor eax,edx
- shl edi,21
- xor eax,esi
- shr ecx,21
- xor ebx,edi
- shr edx,21
- xor eax,ecx
- shl esi,5
- xor ebx,edx
- shl edi,5
- xor eax,esi
- xor ebx,edi
- mov ecx,DWORD [8+esp]
- mov edx,DWORD [12+esp]
- mov esi,DWORD [16+esp]
- mov edi,DWORD [20+esp]
- add eax,DWORD [esp]
- adc ebx,DWORD [4+esp]
- or ecx,esi
- or edx,edi
- and ecx,DWORD [24+esp]
- and edx,DWORD [28+esp]
- and esi,DWORD [8+esp]
- and edi,DWORD [12+esp]
- or ecx,esi
- or edx,edi
- add eax,ecx
- adc ebx,edx
- mov DWORD [esp],eax
- mov DWORD [4+esp],ebx
- mov dl,BYTE [ebp]
- sub esp,8
- lea ebp,[8+ebp]
- cmp dl,23
- jne NEAR L$01016_79_x86
- mov esi,DWORD [840+esp]
- mov edi,DWORD [844+esp]
- mov eax,DWORD [esi]
- mov ebx,DWORD [4+esi]
- mov ecx,DWORD [8+esi]
- mov edx,DWORD [12+esi]
- add eax,DWORD [8+esp]
- adc ebx,DWORD [12+esp]
- mov DWORD [esi],eax
- mov DWORD [4+esi],ebx
- add ecx,DWORD [16+esp]
- adc edx,DWORD [20+esp]
- mov DWORD [8+esi],ecx
- mov DWORD [12+esi],edx
- mov eax,DWORD [16+esi]
- mov ebx,DWORD [20+esi]
- mov ecx,DWORD [24+esi]
- mov edx,DWORD [28+esi]
- add eax,DWORD [24+esp]
- adc ebx,DWORD [28+esp]
- mov DWORD [16+esi],eax
- mov DWORD [20+esi],ebx
- add ecx,DWORD [32+esp]
- adc edx,DWORD [36+esp]
- mov DWORD [24+esi],ecx
- mov DWORD [28+esi],edx
- mov eax,DWORD [32+esi]
- mov ebx,DWORD [36+esi]
- mov ecx,DWORD [40+esi]
- mov edx,DWORD [44+esi]
- add eax,DWORD [40+esp]
- adc ebx,DWORD [44+esp]
- mov DWORD [32+esi],eax
- mov DWORD [36+esi],ebx
- add ecx,DWORD [48+esp]
- adc edx,DWORD [52+esp]
- mov DWORD [40+esi],ecx
- mov DWORD [44+esi],edx
- mov eax,DWORD [48+esi]
- mov ebx,DWORD [52+esi]
- mov ecx,DWORD [56+esi]
- mov edx,DWORD [60+esi]
- add eax,DWORD [56+esp]
- adc ebx,DWORD [60+esp]
- mov DWORD [48+esi],eax
- mov DWORD [52+esi],ebx
- add ecx,DWORD [64+esp]
- adc edx,DWORD [68+esp]
- mov DWORD [56+esi],ecx
- mov DWORD [60+esi],edx
- add esp,840
- sub ebp,640
- cmp edi,DWORD [8+esp]
- jb NEAR L$002loop_x86
- mov esp,DWORD [12+esp]
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
align 64
L$001K512:
dd 3609767458,1116352408
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
index f991f6c..a8fd1f9 100644
--- a/gen/bcm/x86-mont-apple.S
+++ b/gen/bcm/x86-mont-apple.S
@@ -62,12 +62,6 @@
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %edx,24(%esp)
- call L003PIC_me_up
-L003PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L004non_sse2
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
@@ -91,7 +85,7 @@
psrlq $32,%mm3
incl %ecx
.align 4,0x90
-L0051st:
+L0031st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -106,7 +100,7 @@
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl L0051st
+ jl L0031st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -120,7 +114,7 @@
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-L006outer:
+L004outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -142,7 +136,7 @@
paddq %mm6,%mm2
incl %ecx
decl %ebx
-L007inner:
+L005inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -159,7 +153,7 @@
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz L007inner
+ jnz L005inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -177,264 +171,11 @@
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle L006outer
+ jle L004outer
emms
- jmp L008common_tail
+ jmp L006common_tail
.align 4,0x90
-L004non_sse2:
- movl 8(%esp),%esi
- leal 1(%ebx),%ebp
- movl 12(%esp),%edi
- xorl %ecx,%ecx
- movl %esi,%edx
- andl $1,%ebp
- subl %edi,%edx
- leal 4(%edi,%ebx,4),%eax
- orl %edx,%ebp
- movl (%edi),%edi
- jz L009bn_sqr_mont
- movl %eax,28(%esp)
- movl (%esi),%eax
- xorl %edx,%edx
-.align 4,0x90
-L010mull:
- movl %edx,%ebp
- mull %edi
- addl %eax,%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- movl (%esi,%ecx,4),%eax
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl L010mull
- movl %edx,%ebp
- mull %edi
- movl 20(%esp),%edi
- addl %ebp,%eax
- movl 16(%esp),%esi
- adcl $0,%edx
- imull 32(%esp),%edi
- movl %eax,32(%esp,%ebx,4)
- xorl %ecx,%ecx
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- movl (%esi),%eax
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- incl %ecx
- jmp L0112ndmadd
-.align 4,0x90
-L0121stmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl L0121stmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- addl %eax,%ebp
- adcl $0,%edx
- imull 32(%esp),%edi
- xorl %ecx,%ecx
- addl 36(%esp,%ebx,4),%edx
- movl %ebp,32(%esp,%ebx,4)
- adcl $0,%ecx
- movl (%esi),%eax
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- movl $1,%ecx
-.align 4,0x90
-L0112ndmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl L0112ndmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- xorl %eax,%eax
- movl 12(%esp),%ecx
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- leal 4(%ecx),%ecx
- movl %edx,32(%esp,%ebx,4)
- cmpl 28(%esp),%ecx
- movl %eax,36(%esp,%ebx,4)
- je L008common_tail
- movl (%ecx),%edi
- movl 8(%esp),%esi
- movl %ecx,12(%esp)
- xorl %ecx,%ecx
- xorl %edx,%edx
- movl (%esi),%eax
- jmp L0121stmadd
-.align 4,0x90
-L009bn_sqr_mont:
- movl %ebx,(%esp)
- movl %ecx,12(%esp)
- movl %edi,%eax
- mull %edi
- movl %eax,32(%esp)
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
- incl %ecx
-.align 4,0x90
-L013sqr:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal 1(%ecx),%ecx
- adcl $0,%edx
- leal (%ebx,%eax,2),%ebp
- shrl $31,%eax
- cmpl (%esp),%ecx
- movl %eax,%ebx
- movl %ebp,28(%esp,%ecx,4)
- jl L013sqr
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- leal (%ebx,%eax,2),%ebp
- imull 32(%esp),%edi
- shrl $31,%eax
- movl %ebp,32(%esp,%ecx,4)
- leal (%eax,%edx,2),%ebp
- movl (%esi),%eax
- shrl $31,%edx
- movl %ebp,36(%esp,%ecx,4)
- movl %edx,40(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- movl %ecx,%ebx
- adcl $0,%edx
- movl 4(%esi),%eax
- movl $1,%ecx
-.align 4,0x90
-L0143rdmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- movl 4(%esi,%ecx,4),%eax
- adcl $0,%edx
- movl %ebp,28(%esp,%ecx,4)
- movl %edx,%ebp
- mull %edi
- addl 36(%esp,%ecx,4),%ebp
- leal 2(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl L0143rdmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- movl 12(%esp),%ecx
- xorl %eax,%eax
- movl 8(%esp),%esi
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- movl %edx,32(%esp,%ebx,4)
- cmpl %ebx,%ecx
- movl %eax,36(%esp,%ebx,4)
- je L008common_tail
- movl 4(%esi,%ecx,4),%edi
- leal 1(%ecx),%ecx
- movl %edi,%eax
- movl %ecx,12(%esp)
- mull %edi
- addl 32(%esp,%ecx,4),%eax
- adcl $0,%edx
- movl %eax,32(%esp,%ecx,4)
- xorl %ebp,%ebp
- cmpl %ebx,%ecx
- leal 1(%ecx),%ecx
- je L015sqrlast
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
-.align 4,0x90
-L016sqradd:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal (%eax,%eax,1),%ebp
- adcl $0,%edx
- shrl $31,%eax
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%eax
- addl %ebx,%ebp
- adcl $0,%eax
- cmpl (%esp),%ecx
- movl %ebp,28(%esp,%ecx,4)
- movl %eax,%ebx
- jle L016sqradd
- movl %edx,%ebp
- addl %edx,%edx
- shrl $31,%ebp
- addl %ebx,%edx
- adcl $0,%ebp
-L015sqrlast:
- movl 20(%esp),%edi
- movl 16(%esp),%esi
- imull 32(%esp),%edi
- addl 32(%esp,%ecx,4),%edx
- movl (%esi),%eax
- adcl $0,%ebp
- movl %edx,32(%esp,%ecx,4)
- movl %ebp,36(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- leal -1(%ecx),%ebx
- adcl $0,%edx
- movl $1,%ecx
- movl 4(%esi),%eax
- jmp L0143rdmadd
-.align 4,0x90
-L008common_tail:
+L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -442,19 +183,19 @@
movl %ebx,%ecx
xorl %edx,%edx
.align 4,0x90
-L017sub:
+L007sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge L017sub
+ jge L007sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp L018copy
+ jmp L008copy
.align 4,0x90
-L018copy:
+L008copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -463,7 +204,7 @@
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge L018copy
+ jge L008copy
movl 24(%esp),%esp
movl $1,%eax
L000just_leave:
@@ -477,8 +218,4 @@
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
index e6b4ef5..3d3ddb5 100644
--- a/gen/bcm/x86-mont-linux.S
+++ b/gen/bcm/x86-mont-linux.S
@@ -63,12 +63,6 @@
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %edx,24(%esp)
- call .L003PIC_me_up
-.L003PIC_me_up:
- popl %eax
- leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc .L004non_sse2
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
@@ -92,7 +86,7 @@
psrlq $32,%mm3
incl %ecx
.align 16
-.L0051st:
+.L0031st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -107,7 +101,7 @@
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl .L0051st
+ jl .L0031st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -121,7 +115,7 @@
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-.L006outer:
+.L004outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -143,7 +137,7 @@
paddq %mm6,%mm2
incl %ecx
decl %ebx
-.L007inner:
+.L005inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -160,7 +154,7 @@
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz .L007inner
+ jnz .L005inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -178,264 +172,11 @@
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle .L006outer
+ jle .L004outer
emms
- jmp .L008common_tail
+ jmp .L006common_tail
.align 16
-.L004non_sse2:
- movl 8(%esp),%esi
- leal 1(%ebx),%ebp
- movl 12(%esp),%edi
- xorl %ecx,%ecx
- movl %esi,%edx
- andl $1,%ebp
- subl %edi,%edx
- leal 4(%edi,%ebx,4),%eax
- orl %edx,%ebp
- movl (%edi),%edi
- jz .L009bn_sqr_mont
- movl %eax,28(%esp)
- movl (%esi),%eax
- xorl %edx,%edx
-.align 16
-.L010mull:
- movl %edx,%ebp
- mull %edi
- addl %eax,%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- movl (%esi,%ecx,4),%eax
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl .L010mull
- movl %edx,%ebp
- mull %edi
- movl 20(%esp),%edi
- addl %ebp,%eax
- movl 16(%esp),%esi
- adcl $0,%edx
- imull 32(%esp),%edi
- movl %eax,32(%esp,%ebx,4)
- xorl %ecx,%ecx
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- movl (%esi),%eax
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- incl %ecx
- jmp .L0112ndmadd
-.align 16
-.L0121stmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl .L0121stmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- addl %eax,%ebp
- adcl $0,%edx
- imull 32(%esp),%edi
- xorl %ecx,%ecx
- addl 36(%esp,%ebx,4),%edx
- movl %ebp,32(%esp,%ebx,4)
- adcl $0,%ecx
- movl (%esi),%eax
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- movl $1,%ecx
-.align 16
-.L0112ndmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl .L0112ndmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- xorl %eax,%eax
- movl 12(%esp),%ecx
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- leal 4(%ecx),%ecx
- movl %edx,32(%esp,%ebx,4)
- cmpl 28(%esp),%ecx
- movl %eax,36(%esp,%ebx,4)
- je .L008common_tail
- movl (%ecx),%edi
- movl 8(%esp),%esi
- movl %ecx,12(%esp)
- xorl %ecx,%ecx
- xorl %edx,%edx
- movl (%esi),%eax
- jmp .L0121stmadd
-.align 16
-.L009bn_sqr_mont:
- movl %ebx,(%esp)
- movl %ecx,12(%esp)
- movl %edi,%eax
- mull %edi
- movl %eax,32(%esp)
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
- incl %ecx
-.align 16
-.L013sqr:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal 1(%ecx),%ecx
- adcl $0,%edx
- leal (%ebx,%eax,2),%ebp
- shrl $31,%eax
- cmpl (%esp),%ecx
- movl %eax,%ebx
- movl %ebp,28(%esp,%ecx,4)
- jl .L013sqr
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- leal (%ebx,%eax,2),%ebp
- imull 32(%esp),%edi
- shrl $31,%eax
- movl %ebp,32(%esp,%ecx,4)
- leal (%eax,%edx,2),%ebp
- movl (%esi),%eax
- shrl $31,%edx
- movl %ebp,36(%esp,%ecx,4)
- movl %edx,40(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- movl %ecx,%ebx
- adcl $0,%edx
- movl 4(%esi),%eax
- movl $1,%ecx
-.align 16
-.L0143rdmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- movl 4(%esi,%ecx,4),%eax
- adcl $0,%edx
- movl %ebp,28(%esp,%ecx,4)
- movl %edx,%ebp
- mull %edi
- addl 36(%esp,%ecx,4),%ebp
- leal 2(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl .L0143rdmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- movl 12(%esp),%ecx
- xorl %eax,%eax
- movl 8(%esp),%esi
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- movl %edx,32(%esp,%ebx,4)
- cmpl %ebx,%ecx
- movl %eax,36(%esp,%ebx,4)
- je .L008common_tail
- movl 4(%esi,%ecx,4),%edi
- leal 1(%ecx),%ecx
- movl %edi,%eax
- movl %ecx,12(%esp)
- mull %edi
- addl 32(%esp,%ecx,4),%eax
- adcl $0,%edx
- movl %eax,32(%esp,%ecx,4)
- xorl %ebp,%ebp
- cmpl %ebx,%ecx
- leal 1(%ecx),%ecx
- je .L015sqrlast
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
-.align 16
-.L016sqradd:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal (%eax,%eax,1),%ebp
- adcl $0,%edx
- shrl $31,%eax
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%eax
- addl %ebx,%ebp
- adcl $0,%eax
- cmpl (%esp),%ecx
- movl %ebp,28(%esp,%ecx,4)
- movl %eax,%ebx
- jle .L016sqradd
- movl %edx,%ebp
- addl %edx,%edx
- shrl $31,%ebp
- addl %ebx,%edx
- adcl $0,%ebp
-.L015sqrlast:
- movl 20(%esp),%edi
- movl 16(%esp),%esi
- imull 32(%esp),%edi
- addl 32(%esp,%ecx,4),%edx
- movl (%esi),%eax
- adcl $0,%ebp
- movl %edx,32(%esp,%ecx,4)
- movl %ebp,36(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- leal -1(%ecx),%ebx
- adcl $0,%edx
- movl $1,%ecx
- movl 4(%esi),%eax
- jmp .L0143rdmadd
-.align 16
-.L008common_tail:
+.L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -443,19 +184,19 @@
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L017sub:
+.L007sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L017sub
+ jge .L007sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp .L018copy
+ jmp .L008copy
.align 16
-.L018copy:
+.L008copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -464,7 +205,7 @@
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge .L018copy
+ jge .L008copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
index cd77529..931275d 100644
--- a/gen/bcm/x86-mont-win.asm
+++ b/gen/bcm/x86-mont-win.asm
@@ -13,7 +13,6 @@
%else
section .text code
%endif
-;extern _OPENSSL_ia32cap_P
global _bn_mul_mont
align 16
_bn_mul_mont:
@@ -70,9 +69,6 @@
mov DWORD [20+esp],esi
lea ebx,[edi-3]
mov DWORD [24+esp],edx
- lea eax,[_OPENSSL_ia32cap_P]
- bt DWORD [eax],26
- jnc NEAR L$003non_sse2
mov eax,-1
movd mm7,eax
mov esi,DWORD [8+esp]
@@ -96,7 +92,7 @@
psrlq mm3,32
inc ecx
align 16
-L$0041st:
+L$0031st:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -111,7 +107,7 @@
psrlq mm3,32
lea ecx,[1+ecx]
cmp ecx,ebx
- jl NEAR L$0041st
+ jl NEAR L$0031st
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -125,7 +121,7 @@
paddq mm3,mm2
movq [32+ebx*4+esp],mm3
inc edx
-L$005outer:
+L$004outer:
xor ecx,ecx
movd mm4,DWORD [edx*4+edi]
movd mm5,DWORD [esi]
@@ -147,7 +143,7 @@
paddq mm2,mm6
inc ecx
dec ebx
-L$006inner:
+L$005inner:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -164,7 +160,7 @@
paddq mm2,mm6
dec ebx
lea ecx,[1+ecx]
- jnz NEAR L$006inner
+ jnz NEAR L$005inner
mov ebx,ecx
pmuludq mm0,mm4
pmuludq mm1,mm5
@@ -182,264 +178,11 @@
movq [32+ebx*4+esp],mm3
lea edx,[1+edx]
cmp edx,ebx
- jle NEAR L$005outer
+ jle NEAR L$004outer
emms
- jmp NEAR L$007common_tail
+ jmp NEAR L$006common_tail
align 16
-L$003non_sse2:
- mov esi,DWORD [8+esp]
- lea ebp,[1+ebx]
- mov edi,DWORD [12+esp]
- xor ecx,ecx
- mov edx,esi
- and ebp,1
- sub edx,edi
- lea eax,[4+ebx*4+edi]
- or ebp,edx
- mov edi,DWORD [edi]
- jz NEAR L$008bn_sqr_mont
- mov DWORD [28+esp],eax
- mov eax,DWORD [esi]
- xor edx,edx
-align 16
-L$009mull:
- mov ebp,edx
- mul edi
- add ebp,eax
- lea ecx,[1+ecx]
- adc edx,0
- mov eax,DWORD [ecx*4+esi]
- cmp ecx,ebx
- mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$009mull
- mov ebp,edx
- mul edi
- mov edi,DWORD [20+esp]
- add eax,ebp
- mov esi,DWORD [16+esp]
- adc edx,0
- imul edi,DWORD [32+esp]
- mov DWORD [32+ebx*4+esp],eax
- xor ecx,ecx
- mov DWORD [36+ebx*4+esp],edx
- mov DWORD [40+ebx*4+esp],ecx
- mov eax,DWORD [esi]
- mul edi
- add eax,DWORD [32+esp]
- mov eax,DWORD [4+esi]
- adc edx,0
- inc ecx
- jmp NEAR L$0102ndmadd
-align 16
-L$0111stmadd:
- mov ebp,edx
- mul edi
- add ebp,DWORD [32+ecx*4+esp]
- lea ecx,[1+ecx]
- adc edx,0
- add ebp,eax
- mov eax,DWORD [ecx*4+esi]
- adc edx,0
- cmp ecx,ebx
- mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$0111stmadd
- mov ebp,edx
- mul edi
- add eax,DWORD [32+ebx*4+esp]
- mov edi,DWORD [20+esp]
- adc edx,0
- mov esi,DWORD [16+esp]
- add ebp,eax
- adc edx,0
- imul edi,DWORD [32+esp]
- xor ecx,ecx
- add edx,DWORD [36+ebx*4+esp]
- mov DWORD [32+ebx*4+esp],ebp
- adc ecx,0
- mov eax,DWORD [esi]
- mov DWORD [36+ebx*4+esp],edx
- mov DWORD [40+ebx*4+esp],ecx
- mul edi
- add eax,DWORD [32+esp]
- mov eax,DWORD [4+esi]
- adc edx,0
- mov ecx,1
-align 16
-L$0102ndmadd:
- mov ebp,edx
- mul edi
- add ebp,DWORD [32+ecx*4+esp]
- lea ecx,[1+ecx]
- adc edx,0
- add ebp,eax
- mov eax,DWORD [ecx*4+esi]
- adc edx,0
- cmp ecx,ebx
- mov DWORD [24+ecx*4+esp],ebp
- jl NEAR L$0102ndmadd
- mov ebp,edx
- mul edi
- add ebp,DWORD [32+ebx*4+esp]
- adc edx,0
- add ebp,eax
- adc edx,0
- mov DWORD [28+ebx*4+esp],ebp
- xor eax,eax
- mov ecx,DWORD [12+esp]
- add edx,DWORD [36+ebx*4+esp]
- adc eax,DWORD [40+ebx*4+esp]
- lea ecx,[4+ecx]
- mov DWORD [32+ebx*4+esp],edx
- cmp ecx,DWORD [28+esp]
- mov DWORD [36+ebx*4+esp],eax
- je NEAR L$007common_tail
- mov edi,DWORD [ecx]
- mov esi,DWORD [8+esp]
- mov DWORD [12+esp],ecx
- xor ecx,ecx
- xor edx,edx
- mov eax,DWORD [esi]
- jmp NEAR L$0111stmadd
-align 16
-L$008bn_sqr_mont:
- mov DWORD [esp],ebx
- mov DWORD [12+esp],ecx
- mov eax,edi
- mul edi
- mov DWORD [32+esp],eax
- mov ebx,edx
- shr edx,1
- and ebx,1
- inc ecx
-align 16
-L$012sqr:
- mov eax,DWORD [ecx*4+esi]
- mov ebp,edx
- mul edi
- add eax,ebp
- lea ecx,[1+ecx]
- adc edx,0
- lea ebp,[eax*2+ebx]
- shr eax,31
- cmp ecx,DWORD [esp]
- mov ebx,eax
- mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$012sqr
- mov eax,DWORD [ecx*4+esi]
- mov ebp,edx
- mul edi
- add eax,ebp
- mov edi,DWORD [20+esp]
- adc edx,0
- mov esi,DWORD [16+esp]
- lea ebp,[eax*2+ebx]
- imul edi,DWORD [32+esp]
- shr eax,31
- mov DWORD [32+ecx*4+esp],ebp
- lea ebp,[edx*2+eax]
- mov eax,DWORD [esi]
- shr edx,31
- mov DWORD [36+ecx*4+esp],ebp
- mov DWORD [40+ecx*4+esp],edx
- mul edi
- add eax,DWORD [32+esp]
- mov ebx,ecx
- adc edx,0
- mov eax,DWORD [4+esi]
- mov ecx,1
-align 16
-L$0133rdmadd:
- mov ebp,edx
- mul edi
- add ebp,DWORD [32+ecx*4+esp]
- adc edx,0
- add ebp,eax
- mov eax,DWORD [4+ecx*4+esi]
- adc edx,0
- mov DWORD [28+ecx*4+esp],ebp
- mov ebp,edx
- mul edi
- add ebp,DWORD [36+ecx*4+esp]
- lea ecx,[2+ecx]
- adc edx,0
- add ebp,eax
- mov eax,DWORD [ecx*4+esi]
- adc edx,0
- cmp ecx,ebx
- mov DWORD [24+ecx*4+esp],ebp
- jl NEAR L$0133rdmadd
- mov ebp,edx
- mul edi
- add ebp,DWORD [32+ebx*4+esp]
- adc edx,0
- add ebp,eax
- adc edx,0
- mov DWORD [28+ebx*4+esp],ebp
- mov ecx,DWORD [12+esp]
- xor eax,eax
- mov esi,DWORD [8+esp]
- add edx,DWORD [36+ebx*4+esp]
- adc eax,DWORD [40+ebx*4+esp]
- mov DWORD [32+ebx*4+esp],edx
- cmp ecx,ebx
- mov DWORD [36+ebx*4+esp],eax
- je NEAR L$007common_tail
- mov edi,DWORD [4+ecx*4+esi]
- lea ecx,[1+ecx]
- mov eax,edi
- mov DWORD [12+esp],ecx
- mul edi
- add eax,DWORD [32+ecx*4+esp]
- adc edx,0
- mov DWORD [32+ecx*4+esp],eax
- xor ebp,ebp
- cmp ecx,ebx
- lea ecx,[1+ecx]
- je NEAR L$014sqrlast
- mov ebx,edx
- shr edx,1
- and ebx,1
-align 16
-L$015sqradd:
- mov eax,DWORD [ecx*4+esi]
- mov ebp,edx
- mul edi
- add eax,ebp
- lea ebp,[eax*1+eax]
- adc edx,0
- shr eax,31
- add ebp,DWORD [32+ecx*4+esp]
- lea ecx,[1+ecx]
- adc eax,0
- add ebp,ebx
- adc eax,0
- cmp ecx,DWORD [esp]
- mov DWORD [28+ecx*4+esp],ebp
- mov ebx,eax
- jle NEAR L$015sqradd
- mov ebp,edx
- add edx,edx
- shr ebp,31
- add edx,ebx
- adc ebp,0
-L$014sqrlast:
- mov edi,DWORD [20+esp]
- mov esi,DWORD [16+esp]
- imul edi,DWORD [32+esp]
- add edx,DWORD [32+ecx*4+esp]
- mov eax,DWORD [esi]
- adc ebp,0
- mov DWORD [32+ecx*4+esp],edx
- mov DWORD [36+ecx*4+esp],ebp
- mul edi
- add eax,DWORD [32+esp]
- lea ebx,[ecx-1]
- adc edx,0
- mov ecx,1
- mov eax,DWORD [4+esi]
- jmp NEAR L$0133rdmadd
-align 16
-L$007common_tail:
+L$006common_tail:
mov ebp,DWORD [16+esp]
mov edi,DWORD [4+esp]
lea esi,[32+esp]
@@ -447,19 +190,19 @@
mov ecx,ebx
xor edx,edx
align 16
-L$016sub:
+L$007sub:
sbb eax,DWORD [edx*4+ebp]
mov DWORD [edx*4+edi],eax
dec ecx
mov eax,DWORD [4+edx*4+esi]
lea edx,[1+edx]
- jge NEAR L$016sub
+ jge NEAR L$007sub
sbb eax,0
mov edx,-1
xor edx,eax
- jmp NEAR L$017copy
+ jmp NEAR L$008copy
align 16
-L$017copy:
+L$008copy:
mov esi,DWORD [32+ebx*4+esp]
mov ebp,DWORD [ebx*4+edi]
mov DWORD [32+ebx*4+esp],ecx
@@ -468,7 +211,7 @@
or ebp,esi
mov DWORD [ebx*4+edi],ebp
dec ebx
- jge NEAR L$017copy
+ jge NEAR L$008copy
mov esp,DWORD [24+esp]
mov eax,1
L$000just_leave:
@@ -482,8 +225,6 @@
db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
db 111,114,103,62,0
-segment .bss
-common _OPENSSL_ia32cap_P 16
%else
; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
ret