Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently doesn't give a performance improvement.

Non-SSE2 path to bn_mul_mont. But it's disabled, because it currently
doesn't give a performance improvement.
48d2335d · Andy Polyakov · 96ea4ae9 · 48d2335d
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 187 additions and 41 deletions

crypto/bn/asm/x86-mont.pl crypto/bn/asm/x86-mont.pl +187 -41

No files found.
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -2,8 +2,9 @@
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
+# project. The module is, however, dual licensed under OpenSSL and
-# forms are granted according to the OpenSSL license.
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 # October 2005
@@ -31,12 +32,12 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
-$i="ebx";
+$i="edx";
 $j="ecx";
 $ap="esi";
 $rp="edi";	$bp="edi";		# overlapping variables!!!
-$np="edx";
+$np="ebp";
-$num="ebp";
+$num="ebx";
 $_rp=&DWP(4*0,"esp");			# stack top layout
 $_ap=&DWP(4*1,"esp");
@@ -45,21 +46,13 @@ $_np=&DWP(4*3,"esp");
 $_n0=&DWP(4*4,"esp");
 $_num=&DWP(4*5,"esp");
 $_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
 $frame=32;				# size of above frame rounded up to 16n
-$acc0="mm0";				# mmx register bank layout
+	&xor	("eax","eax");
-$acc1="mm1";
+	&mov	("edi",&wparam(5));	# int num
-$car0="mm2";
+	&cmp	("edi",3);
-$car1="mm3";
+	&jb	(&label("just_leave"));
-$mul0="mm4";
-$mul1="mm5";
-$temp="mm6";
-$mask="mm7";
-if($sse2) {
-	&picmeup("eax","OPENSSL_ia32cap_P");
-	&bt	(&DWP(0,"eax"),26);
-	&jnc	(&label("non_sse2"));
 	################################# load argument block...
 	&mov	("eax",&wparam(0));	# BN_ULONG *rp
@@ -67,16 +60,14 @@ if($sse2) {
 	&mov	("ecx",&wparam(2));	# const BN_ULONG *bp
 	&mov	("edx",&wparam(3));	# const BN_ULONG *np
 	&mov	("esi",&wparam(4));	# const BN_ULONG *n0
-	&mov	($num,&wparam(5));	# int num
+	#&mov	("edi",&wparam(5));	# int num
-	&mov	("edi","esp");		# saved stack pointer!
+	&mov	("ebp","esp");		# saved stack pointer!
-	&add	($num,1);		# extra word on top of tp
+	&add	("edi",2);		# extra two words on top of tp
-	&neg	($num);
+	&neg	("edi");
-	&lea	("esp",&DWP(-$frame,"esp",$num,4));	# alloca($frame+8*($num+1))
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
-	&neg	($num);
+	&neg	("edi");
-	&and	("esp",-1024);		# minimize TLB utilization
+	&and	("esp",-4096);		# minimize TLB utilization
-	&sub	($num,1);		# num is restored to its original value
-					# and will remain constant from now...
 	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
 	&mov	($_rp,"eax");		# ... save a copy of argument block
@@ -84,8 +75,23 @@ if($sse2) {
 	&mov	($_bp,"ecx");
 	&mov	($_np,"edx");
 	&mov	($_n0,"esi");
-	#&mov	($_num,$num);		# redundant in sse2 context
+	&lea	($num,&DWP(-2,"edi"));	# num is restored to its original value
-	&mov	($_sp,"edi");		# saved stack pointer!
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
 	&mov	("eax",-1);
 	&movd	($mask,"eax");		# mask 32 lower bits
@@ -195,7 +201,153 @@ if($sse2) {
 	&jl	(&label("outer"));
 	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+&set_label("non_sse2",16);
+}
+if (1) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# The code below gives a ~15% improvement on the 512-bit benchmark
+	# *only* :-( For all other key lengths it's slower by up to 20%.
+	# This is because the original code path reduces the overall
+	# number of multiplications by ~25% by deploying bn_sqr_words.
+	# In other words, for the code below to be competitive,
+	# a dedicated squaring procedure is a must...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+	&sub	($num,1);		# non-SSE2 path uses num-1
+	&mov	($inp,$_ap);
+	&mov	($word,$_bp);
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&mov	($_bpend,"eax");
+	&xor	($j,$j);
+	&xor	("edx","edx");
+&set_label("mull",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&lea	($j,&DWP(1,$j));
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),"eax");	# tp[j]=
+	&cmp	($j,$num);
+	&jb	(&label("mull"));
+	&mov	("eax",&DWP(0,$inp,$num,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&adc	("edx",0);
+	&mov	($j,1);
+	&jmp	(&label("2ndmadd"));
+&set_label("1stmadd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&lea	($j,&DWP(1,$j));
+	&add	("eax",&DWP($frame-4,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),"eax");	# tp[j]=
+	&cmp	($j,$num);
+	&jb	(&label("1stmadd"));
+	&mov	("eax",&DWP(0,$inp,$num,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&adc	($j,0);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&adc	("edx",0);
+	&mov	($j,1);
+&set_label("2ndmadd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&lea	($j,&DWP(1,$j));
+	&add	("eax",&DWP($frame-4,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	(&DWP($frame-8,"esp",$j,4),"eax");	# tp[j-1]=
+	&cmp	($j,$num);
+	&jb	(&label("2ndmadd"));
+	&mov	("eax",&DWP(0,$inp,$num,4));		# np[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# np[num-1]*m
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	("eax",$carry);
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),"eax");	# tp[num-2]=
+	&xor	("eax","eax");
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&mov	($carry,$_bp);				# &bp[i]
+	&add	($carry,4);
+	&cmp	($carry,$_bpend);
+	&je	(&label("x86done"));
+	&mov	($word,&DWP(0,$carry));			# bp[i]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$carry);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&jmp	(&label("1stmadd"));
+&set_label("x86done",16);
+	&mov	($np,$_np);	# make adjustments for tail processing
+	&add	($num,1);
+}
+&set_label("common_tail",16);
 	&mov	("esi",&DWP($frame,"esp",$num,4));# load upmost overflow bit
 	&mov	($rp,$_rp);			# load result pointer
 						# [$ap and $bp are zapped]
@@ -206,15 +358,15 @@ if($sse2) {
 	&mov	("eax",&DWP($frame,"esp",$j,4));
 	&cmp	("eax",&DWP(0,$np,$j,4));	# tp[num-1]-np[num-1]?
 	&jae	(&label("sub"));		# if taken CF is cleared
-&set_label("copy");
+&set_label("copy",16);
 	&mov	("eax",&DWP($frame,"esp",$j,4));
 	&mov	(&DWP(0,$rp,$j,4),"eax");	# rp[i]=tp[i]
 	&mov	(&DWP($frame,"esp",$j,4),$j);	# zap temporary vector
 	&dec	($j);
 	&jge	(&label("copy"));
-	&jmp	(&label("exit_sse2"));
+	&jmp	(&label("exit"));
-&set_label("sub",4);
+&set_label("sub",16);
 	&mov	("eax",&DWP($frame,"esp",$i,4));
 	&sbb	("eax",&DWP(0,$np,$i,4));
 	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
@@ -224,21 +376,15 @@ if($sse2) {
 	&lea	($j,&DWP(-1,$num));		# j=num-1
 	&sbb	("esi",0);			# esi holds upmost overflow bit
 	&jc	(&label("copy"));
-&set_label("zap");
+&set_label("zap",16);
 	&mov	(&DWP($frame,"esp",$j,4),$i);	# zap temporary vector
 	&dec	($j);
 	&jge	(&label("zap"));
-&set_label("exit_sse2");
+&set_label("exit",4);
 	&mov	("esp",$_sp);		# pull saved stack pointer
 	&mov	("eax",1);
-	&jmp	(&label("leave"));
+&set_label("just_leave");
-&set_label("non_sse2");
-}
-	&xor	("eax","eax");	# zero signals "not implemented [yet]"
-&set_label("leave");
 &function_end("bn_mul_mont");
 &asm_finish();