diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 04001e6aae793ad4d16bd6dce5c0b30743920da5..f1aca3e5343157e4e67792b64b3a303a2e0b175c 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -59,11 +59,11 @@ # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # -# Westmere 1.76(+14%) -# Sandy Bridge 1.79(+9%) -# Ivy Bridge 1.79(+8%) +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) -# Bulldozer 1.52(+25%) +# Bulldozer 1.49(+27%) # March 2013 # @@ -673,8 +673,8 @@ $code.=<<___; pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh - xorps $Xl,$Xln pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn @@ -692,8 +692,8 @@ $code.=<<___; pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x11,$Hkey3,$Xh - xorps $Xl,$Xln pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp @@ -711,23 +711,23 @@ $code.=<<___; xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh - pshufd \$0b01001110,$Xl,$Xm pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK - pclmulqdq \$0x00,$Hkey,$Xl xorps $Xmn,$T1 - movdqa $Xln,$Xhn + pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing - pxor $Xln,$Xmn + movdqa $Xln,$Xhn pxor $Xhi,$T1 # + pxor $Xln,$Xmn movdqa $T1,$T2 # - pslldq \$8,$T1 pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa .L7_mask(%rip),$T1 @@ -736,8 +736,8 @@ $code.=<<___; pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # - pclmulqdq \$0x00,$HK,$Xm pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 @@ -764,32 +764,31 @@ $code.=<<___; movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm - pxor $Xl,$Xm - pclmulqdq \$0x00,$Hkey3,$Xl pxor $T2,$Xi # pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln - pxor $Xhi,$Xi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn - movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T1 - pxor $Xi,$T1 - lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi - xorps $Xm,$Xmn pclmulqdq \$0x11,$Hkey4,$Xhi - xorps $Xln,$Xi pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 @@ -852,13 +851,13 @@ $code.=<<___; pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi - movdqu ($inp),$Xhn # Ii + movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing - pshufb $T3,$Xhn + pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 - pxor $Xhn,$Xhi # "Ii+Xi", consume early + pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # @@ -885,9 +884,9 @@ $code.=<<___; pxor $T1,$Xhi # pxor $Xhn,$Xmn # - pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi @@ -896,7 +895,6 @@ $code.=<<___; psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # - .byte 0x66,0x90 sub \$0x20,$len ja .Lmod_loop