sha1-[586|x86_64].pl: shave off one instruction from body_40_59; that's
2% fewer instructions in SIMD code paths, so a 2% improvement on average :-)

sha1-[586|x86_64].pl: shave off one instruction from body_40_59; that's
2% fewer instructions in SIMD code paths, so a 2% improvement on average :-)
69f45c52 · Andy Polyakov · 7bb98eee · 69f45c52 · 69f45c52
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 30 additions and 28 deletions

crypto/sha/asm/sha1-586.pl crypto/sha/asm/sha1-586.pl +15 -14

crypto/sha/asm/sha1-x86_64.pl crypto/sha/asm/sha1-x86_64.pl +15 -14

No files found.
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -89,12 +89,12 @@
 # P4		10.6		-
 # AMD K8	7.1		-
 # Core2		7.3		6.1/+20%	-
-# Atom		12.5		9.5(*)/+32%	-
-# Westmere	7.3		5.6/+30%	-
-# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
-# Ivy Bridge	7.2		4.9/+47%	4.8(**)/+50%
-# Bulldozer	11.6		6.2/+88%
-# VIA Nano	10.6		7.5/+41%
+# Atom		12.5		9.3(*)/+35%	-
+# Westmere	7.3		5.5/+33%	-
+# Sandy Bridge	8.8		6.2/+40%	5.2(**)/+70%
+# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
+# Bulldozer	11.6		6.0/+92%
+# VIA Nano	10.6		7.6/+40%
 #
 # (*)	Loop is 1056 instructions long and expected result is ~8.25.
 #	It remains mystery [to me] why ILP is limited to 1.7.
@@ -616,7 +616,7 @@ sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 sub Xupdate_ssse3_32_79()
 { use integer;
  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

 	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
@@ -783,17 +783,16 @@ sub body_20_39 () {
 sub body_40_59 () {
 	(
 	'($a,$b,$c,$d,$e)=@V;'.
-	'&mov	(@T[1],$c);',
-	'&xor	($c,$d);',
+	'&xor	(@T[0],$c);',	# ($b^$c), assuming @T[0] holds $b on entry
+	'&xor	(@T[1],$d);',	# ($c^$d), assuming @T[1] holds copy of $c on entry
 	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
-	'&and	(@T[1],$d);',
-	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&and	(@T[0],@T[1]);',	# ($b^$c)&($c^$d)
 	'&$_ror	($b,7);',	# $b>>>2
-	'&add	($e,@T[1]);',
+	'&xor	(@T[0],$c);',	# (($b^$c)&($c^$d))^$c, $c itself is left untouched
 	'&mov	(@T[1],$a);',	# $b in next round
 	'&$_rol	($a,5);',
 	'&add	($e,@T[0]);',
-	'&xor	($c,$d);',	# restore $c
+	'&mov	(@T[0],$b);',	# copy of $c in next round (taken after $b was rotated)
 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 	);
 }
@@ -809,6 +808,7 @@ sub body_40_59 () {
 	&Xupdate_ssse3_32_79(\&body_20_39);
 	&Xupdate_ssse3_32_79(\&body_20_39);
 	&Xupdate_ssse3_32_79(\&body_20_39);
+	&mov	(@T[1],@V[2]);	# copy of $c in next round
 	&Xupdate_ssse3_32_79(\&body_40_59);
 	&Xupdate_ssse3_32_79(\&body_40_59);
 	&Xupdate_ssse3_32_79(\&body_40_59);
@@ -1032,7 +1032,7 @@ sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
 sub Xupdate_avx_32_79()
 { use integer;
  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

 	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
@@ -1173,6 +1173,7 @@ sub Xtail_avx()
 	&Xupdate_avx_32_79(\&body_20_39);
 	&Xupdate_avx_32_79(\&body_20_39);
 	&Xupdate_avx_32_79(\&body_20_39);
+	&mov	(@T[1],@V[2]);	# copy of $c in next round
 	&Xupdate_avx_32_79(\&body_40_59);
 	&Xupdate_avx_32_79(\&body_40_59);
 	&Xupdate_avx_32_79(\&body_40_59);

--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -56,12 +56,12 @@
 #		x86_64		SSSE3		AVX
 # P4		9.8		-
 # Opteron	6.6		-
-# Core2		6.7		6.1/+10%	-
-# Atom		11.0		9.7/+13%	-
-# Westmere	7.1		5.6/+27%	-
-# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
-# Ivy Bridge	6.4		4.8/+33%	4.7/+36%
-# Bulldozer	10.9		6.1/+79%
+# Core2		6.7		6.2/+8%		-
+# Atom		11.0		9.5/+15%	-
+# Westmere	7.1		5.5/+29%	-
+# Sandy Bridge	7.9		6.2/+28%	5.1/+54%
+# Ivy Bridge	6.4		4.7/+35%	4.6/+37%
+# Bulldozer	10.9		6.0/+82%
 # VIA Nano	10.2		7.4/+38%

 $flavour = shift;
@@ -453,7 +453,7 @@ sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 sub Xupdate_ssse3_32_79()
 { use integer;
  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

 	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
@@ -618,17 +618,16 @@ sub body_20_39 () {
 sub body_40_59 () {
 	(
 	'($a,$b,$c,$d,$e)=@V;'.
-	'&mov	(@T[1],$c);',
-	'&xor	($c,$d);',
+	'&xor	(@T[0],$c);',	# ($b^$c), assuming @T[0] holds $b on entry
+	'&xor	(@T[1],$d);',	# ($c^$d), assuming @T[1] holds copy of $c on entry
 	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
-	'&and	(@T[1],$d);',
-	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&and	(@T[0],@T[1]);',	# ($b^$c)&($c^$d)
 	'&$_ror	($b,7);',	# $b>>>2
-	'&add	($e,@T[1]);',
+	'&xor	(@T[0],$c);',	# (($b^$c)&($c^$d))^$c, $c itself is left untouched
 	'&mov	(@T[1],$a);',	# $b in next round
 	'&$_rol	($a,5);',
 	'&add	($e,@T[0]);',
-	'&xor	($c,$d);',	# restore $c
+	'&mov	(@T[0],$b);',	# copy of $c in next round (taken after $b was rotated)
 	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 	);
 }
@@ -646,6 +645,7 @@ ___
 	&Xupdate_ssse3_32_79(\&body_20_39);
 	&Xupdate_ssse3_32_79(\&body_20_39);
 	&Xupdate_ssse3_32_79(\&body_20_39);
+	&mov	(@T[1],@V[2]);	# copy of $c in next round
 	&Xupdate_ssse3_32_79(\&body_40_59);
 	&Xupdate_ssse3_32_79(\&body_40_59);
 	&Xupdate_ssse3_32_79(\&body_40_59);
@@ -859,7 +859,7 @@ sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
 sub Xupdate_avx_32_79()
 { use integer;
  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

 	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
@@ -1002,6 +1002,7 @@ ___
 	&Xupdate_avx_32_79(\&body_20_39);
 	&Xupdate_avx_32_79(\&body_20_39);
 	&Xupdate_avx_32_79(\&body_20_39);
+	&mov	(@T[1],@V[2]);	# copy of $c in next round
 	&Xupdate_avx_32_79(\&body_40_59);
 	&Xupdate_avx_32_79(\&body_40_59);
 	&Xupdate_avx_32_79(\&body_40_59);