Commit 3847d15d authored by Andy Polyakov

[aesni|sha*]-mb-x86_64.pl: add data prefetching.

Parent 3ef477c6
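For readers unfamiliar with software prefetching, here is a rough C sketch of the idea this patch applies, illustrative only and not part of the patch: `prefetcht0` asks the core to pull a cache line into all cache levels ahead of use, so the request overlaps with the arithmetic for the current block instead of stalling a later load. `process_block` is a hypothetical stand-in for the per-block work.

```c
#include <stddef.h>
#include <xmmintrin.h>      /* _mm_prefetch, _MM_HINT_T0 */

void process_block(unsigned char *out, const unsigned char *in); /* hypothetical */

/* Toy streaming loop: while block i is being processed, block i+2 is
 * requested into the cache so later iterations do not stall on loads.
 * Prefetch never faults, so running a little past the end is harmless. */
void process_stream(unsigned char *out, const unsigned char *in, size_t blocks)
{
    for (size_t i = 0; i < blocks; i++) {
        _mm_prefetch((const char *)(in + 16 * (i + 2)), _MM_HINT_T0);
        process_block(out + 16 * i, in + 16 * i);
    }
}
```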
crypto/aes/asm/aesni-mb-x86_64.pl
@@ -15,8 +15,8 @@
# asymptotic measured
# ---------------------------
# Westmere 5.00/4=1.25 5.13/4=1.28
# Atom 15.0/4=3.75 ?15.7/4=3.93
# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
# Haswell 4.44/4=1.11 4.44/4=1.11
# Bulldozer 5.75/4=1.44 5.76/4=1.44
@@ -27,8 +27,8 @@
#
# asymptotic measured
# ---------------------------
# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
# Haswell 5.00/8=0.63 5.00/8=0.63
# Bulldozer 5.75/8=0.72 5.77/8=0.72
#
@@ -188,7 +188,11 @@ $code.=<<___;
sub $offset,$sink
aesenc $rndkey1,@out[0]
prefetcht0 31(@inptr[0],$offset) # prefetch input
prefetcht0 31(@inptr[1],$offset)
aesenc $rndkey1,@out[1]
prefetcht0 31(@inptr[2],$offset)
prefetcht0 31(@inptr[3],$offset)
aesenc $rndkey1,@out[2]
aesenc $rndkey1,@out[3]
movups 0x30-0x78($key),$rndkey1
@@ -199,8 +203,8 @@ $code.=<<___;
cmp `32+4*$i`(%rsp),$one
aesenc $rndkey,@out[0]
aesenc $rndkey,@out[1]
aesenc $rndkey,@out[2]
cmovge $sink,@inptr[$i] # cancel input
cmovg $sink,@outptr[$i] # sink output
aesenc $rndkey,@out[3]
movups `0x40+16*$i-0x78`($key),$rndkey
@@ -209,7 +213,11 @@ ___
$code.=<<___;
movdqa $counters,$mask
aesenc $rndkey0,@out[0]
prefetcht0 15(@outptr[0],$offset) # prefetch output
prefetcht0 15(@outptr[1],$offset)
aesenc $rndkey0,@out[1]
prefetcht0 15(@outptr[2],$offset)
prefetcht0 15(@outptr[3],$offset)
aesenc $rndkey0,@out[2]
aesenc $rndkey0,@out[3]
movups 0x80-0x78($key),$rndkey0
@@ -260,13 +268,15 @@ $code.=<<___;
aesenc $rndkey0,@out[2]
aesenc $rndkey0,@out[3]
movups 0xe0-0x78($key),$rndkey0
jmp .Lenc4x_tail
.align 32
.Lenc4x_tail:
aesenc $rndkey1,@out[0]
aesenc $rndkey1,@out[1]
aesenc $rndkey1,@out[2]
movdqu (@inptr[0],$offset),@inp[0]
aesenc $rndkey1,@out[3]
movdqu 0x10-0x78($key),$rndkey1
aesenclast $rndkey0,@out[0]
@@ -426,7 +436,11 @@ $code.=<<___;
sub $offset,$sink
aesdec $rndkey1,@out[0]
prefetcht0 31(@inptr[0],$offset) # prefetch input
prefetcht0 31(@inptr[1],$offset)
aesdec $rndkey1,@out[1]
prefetcht0 31(@inptr[2],$offset)
prefetcht0 31(@inptr[3],$offset)
aesdec $rndkey1,@out[2]
aesdec $rndkey1,@out[3]
movups 0x30-0x78($key),$rndkey1
@@ -447,7 +461,11 @@ ___
$code.=<<___;
movdqa $counters,$mask
aesdec $rndkey0,@out[0]
prefetcht0 15(@outptr[0],$offset) # prefetch output
prefetcht0 15(@outptr[1],$offset)
aesdec $rndkey0,@out[1]
prefetcht0 15(@outptr[2],$offset)
prefetcht0 15(@outptr[3],$offset)
aesdec $rndkey0,@out[2]
aesdec $rndkey0,@out[3]
movups 0x80-0x78($key),$rndkey0
@@ -498,7 +516,9 @@ $code.=<<___;
aesdec $rndkey0,@out[2]
aesdec $rndkey0,@out[3]
movups 0xe0-0x78($key),$rndkey0
jmp .Ldec4x_tail
.align 32
.Ldec4x_tail:
aesdec $rndkey1,@out[0]
aesdec $rndkey1,@out[1]
@@ -512,12 +532,12 @@ $code.=<<___;
movdqu 0x20-0x78($key),$rndkey0
aesdeclast @inp[0],@out[0]
movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
aesdeclast @inp[1],@out[1]
movdqu -16(@inptr[1],$offset),@inp[1]
aesdeclast @inp[2],@out[2]
movdqu -16(@inptr[2],$offset),@inp[2]
aesdeclast @inp[3],@out[3]
movdqu -16(@inptr[3],$offset),@inp[3]
movups @out[0],-16(@outptr[0],$offset)
@@ -682,7 +702,13 @@ $code.=<<___ if ($i);
___
$code.=<<___;
vaesenc $rndkey,@out[1],@out[1]
prefetcht0 31(@ptr[$i]) # prefetch input
vaesenc $rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
prefetcht0 15(@ptr[$i-2]) # prefetch output
___
$code.=<<___;
vaesenc $rndkey,@out[3],@out[3]
lea (@ptr[$i],$offset),$offset
cmovge %rsp,@ptr[$i] # cancel input
@@ -703,6 +729,8 @@ ___
}
$code.=<<___;
vmovdqu 32(%rsp),$counters
prefetcht0 15(@ptr[$i-2]) # prefetch output
prefetcht0 15(@ptr[$i-1])
cmp \$11,$rounds
jb .Lenc8x_tail
@@ -958,7 +986,13 @@ $code.=<<___ if ($i);
___
$code.=<<___;
vaesdec $rndkey,@out[1],@out[1]
prefetcht0 31(@ptr[$i]) # prefetch input
vaesdec $rndkey,@out[2],@out[2]
___
$code.=<<___ if ($i>1);
prefetcht0 15(@ptr[$i-2]) # prefetch output
___
$code.=<<___;
vaesdec $rndkey,@out[3],@out[3]
lea (@ptr[$i],$offset),$offset
cmovge %rsp,@ptr[$i] # cancel input
@@ -979,6 +1013,8 @@ ___
}
$code.=<<___;
vmovdqu 32(%rsp),$counters
prefetcht0 15(@ptr[$i-2]) # prefetch output
prefetcht0 15(@ptr[$i-1])
cmp \$11,$rounds
jb .Ldec8x_tail
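Throughout this file, each 4-lane `aesenc`/`aesdec` group gets prefetches slotted between the round instructions, and the 8-lane AVX path pairs each `vaesenc`/`vaesdec` with a prefetch for one lane. A minimal C sketch of the 4-lane scheduling idea, assuming AES-NI intrinsics and hypothetical names, not the generated code:

```c
#include <wmmintrin.h>      /* _mm_aesenc_si128 (AES-NI) */
#include <xmmintrin.h>      /* _mm_prefetch */

/* One AES round applied to four independent streams, with input
 * prefetches interleaved so they hide behind the aesenc latency. */
static void aes_round_x4(__m128i blk[4], __m128i rk,
                         const unsigned char *inptr[4], long offset)
{
    blk[0] = _mm_aesenc_si128(blk[0], rk);
    _mm_prefetch((const char *)(inptr[0] + offset + 31), _MM_HINT_T0);
    _mm_prefetch((const char *)(inptr[1] + offset + 31), _MM_HINT_T0);
    blk[1] = _mm_aesenc_si128(blk[1], rk);
    _mm_prefetch((const char *)(inptr[2] + offset + 31), _MM_HINT_T0);
    _mm_prefetch((const char *)(inptr[3] + offset + 31), _MM_HINT_T0);
    blk[2] = _mm_aesenc_si128(blk[2], rk);
    blk[3] = _mm_aesenc_si128(blk[3], rk);
}
```

The 31-byte displacement points at the last byte of the 16-byte block after the one currently being read, so whichever cache line that next block touches is requested one iteration early regardless of alignment.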
crypto/sha/asm/sha1-mb-x86_64.pl
@@ -14,20 +14,21 @@
#
# this +aesni(i) sha1 aesni-sha1 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
# Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62%
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
#
# (i) multi-block CBC encrypt with 128-bit key;
# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
# because of lower AES-NI instruction throughput;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 8.00+4.44=12.4;
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 30% to 100% (on Haswell);
$flavour = shift;
$output = shift;
@@ -80,6 +81,14 @@ $Tbl="%rbp";
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";
if (1) {
# Atom-specific optimization aiming to eliminate pshufb with high
# registers [and thus get rid of 48 cycles accumulated penalty]
@Xi=map("%xmm$_",(0..4));
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
}
$REG_SZ=16;
sub Xi_off {
@@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input
psrld \$2,$b
paddd $t2,$e # e+=rol(a,5)
movd `4*$j-16*4`(@ptr[2]),$t2
pshufb $tx,@Xi[1]
por $t1,$b # b=rol(b,30)
___
$code.=<<___ if ($i==14); # just load input
@@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input
movdqa $b,$t1
movdqa $b,$t0
pslld \$5,$t2
prefetcht0 63(@ptr[0])
pandn $d,$t1
pand $c,$t0
punpckldq $t3,@Xi[1]
@@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input
psrld \$27,$t3
pxor $t1,$t0 # Ch(b,c,d)
movdqa $b,$t1
prefetcht0 63(@ptr[1])
por $t3,$t2 # rol(a,5)
pslld \$30,$t1
paddd $t0,$e # e+=Ch(b,c,d)
prefetcht0 63(@ptr[2])
psrld \$2,$b
paddd $t2,$e # e+=rol(a,5)
pshufb $tx,@Xi[1]
prefetcht0 63(@ptr[3])
por $t1,$b # b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
@@ -382,12 +395,12 @@ $code.=<<___;
movdqu 0x60($ctx),$D
movdqu 0x80($ctx),$E
movdqa 0x60($Tbl),$tx # pbswap_mask
movdqa -0x20($Tbl),$K # K_00_19
jmp .Loop
.align 32
.Loop:
___
$code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
@@ -434,6 +447,7 @@ $code.=<<___;
movdqa @Xi[0],(%rbx) # save counters
movdqa 0x60($Tbl),$tx # pbswap_mask
movdqa -0x20($Tbl),$K # K_00_19
dec $num
jnz .Loop
@@ -551,6 +565,7 @@ $code.=<<___ if ($i<14);
___
$code.=<<___ if ($i==14);
vpaddd $K,$e,$e # e+=K_00_19
prefetcht0 63(@ptr[0])
vpslld \$5,$a,$t2
vpandn $d,$b,$t1
vpand $c,$b,$t0
@@ -559,14 +574,17 @@ $code.=<<___ if ($i==14);
vpaddd @Xi[0],$e,$e # e+=X[i]
$vpack $t3,@Xi[1],@Xi[1]
vpsrld \$27,$a,$t3
prefetcht0 63(@ptr[1])
vpxor $t1,$t0,$t0 # Ch(b,c,d)
vpslld \$30,$b,$t1
vpor $t3,$t2,$t2 # rol(a,5)
prefetcht0 63(@ptr[2])
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
vpsrld \$2,$b,$b
vpaddd $t2,$e,$e # e+=rol(a,5)
prefetcht0 63(@ptr[3])
vpshufb $tx,@Xi[1],@Xi[1]
vpor $t1,$b,$b # b=rol(b,30)
___
@@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate
vpaddd $K,$e,$e # e+=K_00_19
vpslld \$5,$a,$t2
vpandn $d,$b,$t1
`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpand $c,$b,$t0
vmovdqa @Xi[0],`&Xi_off($i)`
@@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15);
vpsrld \$27,$a,$t3
vpxor $t1,$t0,$t0 # Ch(b,c,d)
vpxor @Xi[3],@Xi[1],@Xi[1]
`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpslld \$30,$b,$t1
vpor $t3,$t2,$t2 # rol(a,5)
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpsrld \$31,@Xi[1],$tx
vpaddd @Xi[1],@Xi[1],@Xi[1]
vpsrld \$2,$b,$b
`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpaddd $t2,$e,$e # e+=rol(a,5)
vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
vpor $t1,$b,$b # b=rol(b,30)
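In this file each xmm register carries one SHA-1 state word for four independent messages, so the `pandn`/`pand`/`pxor` sequence evaluates Ch(b,c,d) for all four lanes at once and the new `prefetcht0` instructions hide behind that arithmetic. A rough SSE2 equivalent of the two idioms, illustrative only with hypothetical helper names:

```c
#include <emmintrin.h>      /* SSE2 integer intrinsics */

/* Four-lane Ch(b,c,d) = (b & c) ^ (~b & d), matching the
 * pandn/pand/pxor sequence in BODY_00_19, one 32-bit lane per message. */
static __m128i Ch_x4(__m128i b, __m128i c, __m128i d)
{
    __m128i t1 = _mm_andnot_si128(b, d);    /* ~b & d  (pandn) */
    __m128i t0 = _mm_and_si128(b, c);       /*  b & c  (pand)  */
    return _mm_xor_si128(t0, t1);           /* Ch      (pxor)  */
}

/* b = rol(b,30) built from a shift pair, as pslld/psrld/por build it;
 * SSE2 has no vector rotate instruction. */
static __m128i rol30_x4(__m128i b)
{
    return _mm_or_si128(_mm_slli_epi32(b, 30), _mm_srli_epi32(b, 2));
}
```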
crypto/sha/asm/sha256-mb-x86_64.pl
@@ -15,7 +15,7 @@
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
@@ -27,8 +27,9 @@
# AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
$output = shift;
@@ -135,6 +136,7 @@ $code.=<<___;
psrld \$25-11,$t2
movdqa $e,$t1
`"prefetch 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
@@ -142,6 +144,7 @@ $code.=<<___;
pand $f,$axb
pxor $t2,$sigma
`"prefetch 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
@@ -153,6 +156,7 @@ $code.=<<___;
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
@@ -160,6 +164,7 @@ $code.=<<___;
pand $axb,$bxc
pxor $sigma,$t2
`"prefetch 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
@@ -465,30 +470,38 @@ $code.=<<___;
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
`"prefetch 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
`"prefetch 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
`"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
`"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
`"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
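The SHA-256 rounds above accumulate Sigma1(e) and Sigma0(a) through chains of relative shifts (e.g. `psrld \$25-11`) to reuse intermediate values. Written in the direct form rather than the incremental chain the script uses, the four-lane computation looks like the following SSE2 sketch, under the same one-lane-per-message layout and illustrative only:

```c
#include <emmintrin.h>      /* SSE2 integer intrinsics */

/* Rotate each 32-bit lane right by n with a shift pair; SSE2 has no
 * vector rotate instruction. */
static __m128i ror32x4(__m128i x, int n)
{
    return _mm_or_si128(_mm_srli_epi32(x, n), _mm_slli_epi32(x, 32 - n));
}

/* Four-lane Sigma1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25), the quantity
 * the psrld/pslld/pxor chain accumulates with relative shift counts. */
static __m128i Sigma1_x4(__m128i e)
{
    return _mm_xor_si128(ror32x4(e, 6),
                         _mm_xor_si128(ror32x4(e, 11), ror32x4(e, 25)));
}
```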