crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.

Avoid occasional up to 8% performance drops.

crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.
Avoid occasional up to 8% performance drops.
7a1a1223 · Andy Polyakov · 72a15870 · 7a1a1223
隐藏空白更改
内联并排

Showing with 28 addition and 6 deletion

crypto/modes/asm/aesni-gcm-x86_64.pl crypto/modes/asm/aesni-gcm-x86_64.pl +28 -6

未找到文件。
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -21,8 +21,8 @@
 # justify. This module is based on combination of Intel submissions,
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
+# pressure with notable relative improvement, achieving 1.0 cycle per
-# processor. [Exact performance numbers to be added at launch.]
+# byte processed with 128-bit key on Haswell processor.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
 	vzeroupper
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	vmovdqu		($Xip),$Xi		# load Xi
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
 	lea		0x80($key),$key		# size optimization
 	lea		0x20+0x20($Xip),$Xip	# size optimization
 	mov		0xf0-0x80($key),$rounds
 	vpshufb		$Ii,$Xi,$Xi
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
 	vmovdqu		0x50($inp),$Z3		# I[5]
 	lea		($inp),$in0
 	vmovdqu		0x40($inp),$Z0
@@ -621,14 +632,25 @@ $code.=<<___;
 	vzeroupper
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	lea		0x80($key),$key		# size optimization
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	mov		0xf0-0x80($key),$rounds
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
 	lea		($out),$in0
 	lea		-0xc0($out,$len),$end0
 	shr		\$4,$len