提交 bc5b136c 作者: Andy Polyakov

ghash-x86.pl: optimize for Sandy Bridge.

上级 16cb0d95
...@@ -103,6 +103,16 @@ ...@@ -103,6 +103,16 @@
# providing access to a Westmere-based system on behalf of Intel # providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre. # Open Source Technology Centre.
# January 2010
#
# After being tweaked to optimize transitions between integer and FP
# operations on the same XMM register, the PCLMULQDQ subroutine was
# measured to process one byte in 2.07 cycles on Sandy Bridge and in
# 2.12 cycles on Westmere. The minor regression on Westmere is
# outweighed by the ~15% improvement on Sandy Bridge. Strangely enough,
# an attempt to modify the 64-bit code in a similar manner resulted in
# an almost 20% degradation on Sandy Bridge, where the original 64-bit
# code processes one byte in 1.95 cycles.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm"); push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl"; require "x86asm.pl";
...@@ -829,8 +839,8 @@ my ($Xhi,$Xi,$Hkey)=@_; ...@@ -829,8 +839,8 @@ my ($Xhi,$Xi,$Hkey)=@_;
&pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); ####### &pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T1,$T2,0x00); ####### &pclmulqdq ($T1,$T2,0x00); #######
&pxor ($T1,$Xi); # &xorps ($T1,$Xi); #
&pxor ($T1,$Xhi); # &xorps ($T1,$Xhi); #
&movdqa ($T2,$T1); # &movdqa ($T2,$T1); #
&psrldq ($T1,8); &psrldq ($T1,8);
...@@ -950,7 +960,7 @@ my ($Xhi,$Xi) = @_; ...@@ -950,7 +960,7 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xi,&QWP(0,$Xip)); &movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const)); &movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl)); &movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3); &pshufb ($Xi,$T3);
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
...@@ -993,7 +1003,7 @@ my ($Xhi,$Xi) = @_; ...@@ -993,7 +1003,7 @@ my ($Xhi,$Xi) = @_;
&pxor ($Xi,$T1); # Ii+Xi &pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&lea ($inp,&DWP(32,$inp)); # i+=2 &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20); &sub ($len,0x20);
...@@ -1002,7 +1012,7 @@ my ($Xhi,$Xi) = @_; ...@@ -1002,7 +1012,7 @@ my ($Xhi,$Xi) = @_;
&set_label("mod_loop"); &set_label("mod_loop");
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&movdqu ($T1,&QWP(0,$inp)); # Ii &movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H &movups ($Hkey,&QWP(0,$Htbl)); # load H
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn); &pxor ($Xhi,$Xhn);
...@@ -1043,9 +1053,9 @@ my ($Xhi,$Xi) = @_; ...@@ -1043,9 +1053,9 @@ my ($Xhi,$Xi) = @_;
&pxor ($Xi,$T2); # &pxor ($Xi,$T2); #
&pclmulqdq ($T1,$T3,0x00); ####### &pclmulqdq ($T1,$T3,0x00); #######
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pxor ($T1,$Xn); # &xorps ($T1,$Xn); #
&pxor ($T1,$Xhn); # &xorps ($T1,$Xhn); #
&movdqa ($T3,$T1); # &movdqa ($T3,$T1); #
&psrldq ($T1,8); &psrldq ($T1,8);
...@@ -1069,7 +1079,7 @@ my ($Xhi,$Xi) = @_; ...@@ -1069,7 +1079,7 @@ my ($Xhi,$Xi) = @_;
&test ($len,$len); &test ($len,$len);
&jnz (&label("done")); &jnz (&label("done"));
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H &movups ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail"); &set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii &movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3); &pshufb ($T1,$T3);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册