提交 c4558efb 编写于 作者: A Andy Polyakov

sha512-x86_64.pl: add AVX2 code path.

上级 750398ac
...@@ -59,6 +59,15 @@ ...@@ -59,6 +59,15 @@
# higher coefficients are observed on VIA Nano and Bulldozer has more # higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for # to do with specifics of their architecture [which is topic for
# separate discussion]. # separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD instruction
# sequence as for AVX, but with %ymm operands. A side effect is a larger
# stack frame: 448 additional bytes in SHA256 and 1152 in SHA512.
###################################################################### ######################################################################
# Current performance in cycles per processed byte (less is better): # Current performance in cycles per processed byte (less is better):
...@@ -69,13 +78,13 @@ ...@@ -69,13 +78,13 @@
# P4 17.5 - - 33.4 - # P4 17.5 - - 33.4 -
# Core 2 15.5 13.9(+11%) - 10.3 - # Core 2 15.5 13.9(+11%) - 10.3 -
# Westmere 15.1 12.5(+21%) - 9.72 - # Westmere 15.1 12.5(+21%) - 9.72 -
# Atom 23.0 21.6(+6%) - 14.7 -
# VIA Nano 23.0 16.3(+41%) - 14.7 -
# Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
# Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%) # Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%)
# Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%) # Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%)
# VIA Nano 23.0 16.3(+41%) - 14.7 -
# Atom 23.0 21.6(+6%) - 14.7 -
# #
# (*) whichever applicable; # (*) whichever best applicable;
# (**) switch from ror to shrd stands for fair share of improvement; # (**) switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only # (***) execution time is fully determined by remaining integer-only
# part, body_00_15; reducing the amount of SIMD instructions # part, body_00_15; reducing the amount of SIMD instructions
...@@ -93,15 +102,20 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ...@@ -93,15 +102,20 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl"; die "can't locate x86_64-xlate.pl";
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/ && =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$1>=2.19); $avx = ($1>=2.19) + ($1>=2.22);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && }
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.09); if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
`ml64 2>&1` =~ /Version ([0-9]+)\./ && $avx = ($1>=2.09) + ($1>=2.10);
$1>=10); }
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
$avx = ($1>=10) + ($1>=11);
}
open OUT,"| \"$^X\" $xlate $flavour $output"; open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT; *STDOUT=*OUT;
...@@ -145,6 +159,8 @@ $framesz="16*$SZ+4*8"; ...@@ -145,6 +159,8 @@ $framesz="16*$SZ+4*8";
sub ROUND_00_15() sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
my $STRIDE=$SZ;
$STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
$code.=<<___; $code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
...@@ -186,7 +202,7 @@ $code.=<<___ if ($i>=15); ...@@ -186,7 +202,7 @@ $code.=<<___ if ($i>=15);
mov `$SZ*(($i+2)&0xf)`(%rsp),$a0 mov `$SZ*(($i+2)&0xf)`(%rsp),$a0
___ ___
$code.=<<___; $code.=<<___;
lea $SZ($Tbl),$Tbl # round++ lea $STRIDE($Tbl),$Tbl # round++
add $a1,$h # h+=Sigma0(a) add $a1,$h # h+=Sigma0(a)
___ ___
...@@ -229,28 +245,34 @@ $code=<<___; ...@@ -229,28 +245,34 @@ $code=<<___;
.extern OPENSSL_ia32cap_P .extern OPENSSL_ia32cap_P
.globl $func .globl $func
.type $func,\@function,4 .type $func,\@function,3
.align 16 .align 16
$func: $func:
___ ___
$code.=<<___ if ($SZ==4 || $avx); $code.=<<___ if ($SZ==4 || $avx);
lea OPENSSL_ia32cap_P(%rip),%r11 lea OPENSSL_ia32cap_P(%rip),%r11
mov 0(%r11),%r10d mov 0(%r11),%r9d
mov 4(%r11),%r11d mov 4(%r11),%r10d
mov 8(%r11),%r11d
___ ___
$code.=<<___ if ($avx && $SZ==8); $code.=<<___ if ($avx && $SZ==8);
test \$`1<<11`,%r11d # check for XOP test \$`1<<11`,%r10d # check for XOP
jnz .Lxop_shortcut jnz .Lxop_shortcut
___ ___
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
cmp \$`1<<8|1<<5|1<<3`,%r11d
je .Lavx2_shortcut
___
$code.=<<___ if ($avx); $code.=<<___ if ($avx);
and \$`1<<30`,%r10d # mask "Intel CPU" bit and \$`1<<30`,%r9d # mask "Intel CPU" bit
and \$`1<<28|1<<9`,%r11d # mask AVX and SSSE3 bits and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
or %r10d,%r11d or %r9d,%r10d
cmp \$`1<<28|1<<9|1<<30`,%r11d cmp \$`1<<28|1<<9|1<<30`,%r10d
je .Lavx_shortcut je .Lavx_shortcut
___ ___
$code.=<<___ if ($SZ==4); $code.=<<___ if ($SZ==4);
test \$`1<<9`,%r11d test \$`1<<9`,%r10d
jnz .Lssse3_shortcut jnz .Lssse3_shortcut
___ ___
$code.=<<___; $code.=<<___;
...@@ -352,25 +374,44 @@ $code.=<<___; ...@@ -352,25 +374,44 @@ $code.=<<___;
.type $TABLE,\@object .type $TABLE,\@object
$TABLE: $TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___ ___
} else { } else {
...@@ -378,49 +419,90 @@ $code.=<<___; ...@@ -378,49 +419,90 @@ $code.=<<___;
.align 64 .align 64
.type $TABLE,\@object .type $TABLE,\@object
$TABLE: $TABLE:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f
.asciz "SHA512 block transfort for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .quad 0x0001020304050607,0x08090a0b0c0d0e0f
.asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___ ___
} }
...@@ -489,7 +571,7 @@ my @X = map("%xmm$_",(0..3)); ...@@ -489,7 +571,7 @@ my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___; $code.=<<___;
.type ${func}_ssse3,\@function,4 .type ${func}_ssse3,\@function,3
.align 64 .align 64
${func}_ssse3: ${func}_ssse3:
.Lssse3_shortcut: .Lssse3_shortcut:
...@@ -529,12 +611,12 @@ $code.=<<___; ...@@ -529,12 +611,12 @@ $code.=<<___;
___ ___
$code.=<<___; $code.=<<___;
movdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_ssse3 jmp .Lloop_ssse3
.align 16 .align 16
.Lloop_ssse3: .Lloop_ssse3:
movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
movdqu 0x00($inp),@X[0] movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1] movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2] movdqu 0x20($inp),@X[2]
...@@ -544,11 +626,11 @@ $code.=<<___; ...@@ -544,11 +626,11 @@ $code.=<<___;
pshufb $t3,@X[1] pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0 movdqa 0x00($Tbl),$t0
pshufb $t3,@X[2] pshufb $t3,@X[2]
movdqa 0x10($Tbl),$t1 movdqa 0x20($Tbl),$t1
paddd @X[0],$t0 paddd @X[0],$t0
movdqa 0x20($Tbl),$t2 movdqa 0x40($Tbl),$t2
pshufb $t3,@X[3] pshufb $t3,@X[3]
movdqa 0x30($Tbl),$t3 movdqa 0x60($Tbl),$t3
paddd @X[1],$t1 paddd @X[1],$t1
paddd @X[2],$t2 paddd @X[2],$t2
paddd @X[3],$t3 paddd @X[3],$t3
...@@ -564,7 +646,7 @@ $code.=<<___; ...@@ -564,7 +646,7 @@ $code.=<<___;
.align 16 .align 16
.Lssse3_00_47: .Lssse3_00_47:
add \$16*$SZ,$Tbl sub \$-16*2*$SZ,$Tbl # size optimization
___ ___
sub Xupdate_256_SSSE3 () { sub Xupdate_256_SSSE3 () {
( (
...@@ -601,7 +683,7 @@ sub Xupdate_256_SSSE3 () { ...@@ -601,7 +683,7 @@ sub Xupdate_256_SSSE3 () {
'&pxor ($t3,$t2);', '&pxor ($t3,$t2);',
'&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
'&pxor ($t3,$t2);', '&pxor ($t3,$t2);',
'&movdqa ($t2,16*$j."($Tbl)")', '&movdqa ($t2,16*2*$j."($Tbl)")',
'&pshufb ($t3,$t5)', '&pshufb ($t3,$t5)',
'&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
); );
...@@ -744,7 +826,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -744,7 +826,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&pxor ($t3,$t2); &pxor ($t3,$t2);
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
&movdqa ($t2,16*$j."($Tbl)"); &movdqa ($t2,16*2*$j."($Tbl)");
eval(shift(@insns)); #@ eval(shift(@insns)); #@
eval(shift(@insns)); eval(shift(@insns));
&pshufb ($t3,$t5); &pshufb ($t3,$t5);
...@@ -767,7 +849,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -767,7 +849,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&SSSE3_256_00_47($j,\&body_00_15,@X); &SSSE3_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X) push(@X,shift(@X)); # rotate(@X)
} }
&cmpb ($SZ-1+16*$SZ."($Tbl)",0); &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lssse3_00_47"); &jne (".Lssse3_00_47");
for ($i=0; $i<16; ) { for ($i=0; $i<16; ) {
...@@ -827,7 +909,7 @@ if ($avx) {{ ...@@ -827,7 +909,7 @@ if ($avx) {{
# #
if ($SZ==8) { # SHA512 only if ($SZ==8) { # SHA512 only
$code.=<<___; $code.=<<___;
.type ${func}_xop,\@function,4 .type ${func}_xop,\@function,3
.align 64 .align 64
${func}_xop: ${func}_xop:
.Lxop_shortcut: .Lxop_shortcut:
...@@ -878,7 +960,7 @@ ___ ...@@ -878,7 +960,7 @@ ___
$code.=<<___; $code.=<<___;
.align 16 .align 16
.Lloop_xop: .Lloop_xop:
vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0] vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1] vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2] vmovdqu 0x20($inp),@X[2]
...@@ -889,9 +971,9 @@ $code.=<<___; ...@@ -889,9 +971,9 @@ $code.=<<___;
vpshufb $t3,@X[2],@X[2] vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0 vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3] vpshufb $t3,@X[3],@X[3]
vpaddd 0x10($Tbl),@X[1],$t1 vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x20($Tbl),@X[2],$t2 vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x30($Tbl),@X[3],$t3 vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp) vmovdqa $t0,0x00(%rsp)
mov $A,$a1 mov $A,$a1
vmovdqa $t1,0x10(%rsp) vmovdqa $t1,0x10(%rsp)
...@@ -904,7 +986,7 @@ $code.=<<___; ...@@ -904,7 +986,7 @@ $code.=<<___;
.align 16 .align 16
.Lxop_00_47: .Lxop_00_47:
add \$16*$SZ,$Tbl sub \$-16*2*$SZ,$Tbl # size optimization
___ ___
sub XOP_256_00_47 () { sub XOP_256_00_47 () {
my $j = shift; my $j = shift;
...@@ -1001,7 +1083,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -1001,7 +1083,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
&vpaddd ($t2,@X[0],16*$j."($Tbl)"); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2); &vmovdqa (16*$j."(%rsp)",$t2);
} }
...@@ -1010,7 +1092,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -1010,7 +1092,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&XOP_256_00_47($j,\&body_00_15,@X); &XOP_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X) push(@X,shift(@X)); # rotate(@X)
} }
&cmpb ($SZ-1+16*$SZ."($Tbl)",0); &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lxop_00_47"); &jne (".Lxop_00_47");
for ($i=0; $i<16; ) { for ($i=0; $i<16; ) {
...@@ -1024,9 +1106,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -1024,9 +1106,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
$code.=<<___; $code.=<<___;
.align 16 .align 16
.Lloop_xop: .Lloop_xop:
vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0] vmovdqu 0x00($inp),@X[0]
lea $TABLE(%rip),$Tbl lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1] vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2] vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0] vpshufb $t3,@X[0],@X[0]
...@@ -1040,20 +1122,20 @@ $code.=<<___; ...@@ -1040,20 +1122,20 @@ $code.=<<___;
vpshufb $t3,@X[4],@X[4] vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7] vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5] vpshufb $t3,@X[5],@X[5]
vpaddq 0x00($Tbl),@X[0],$t0 vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6] vpshufb $t3,@X[6],@X[6]
vpaddq 0x10($Tbl),@X[1],$t1 vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7] vpshufb $t3,@X[7],@X[7]
vpaddq 0x20($Tbl),@X[2],$t2 vpaddq -0x40($Tbl),@X[2],$t2
vpaddq 0x30($Tbl),@X[3],$t3 vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp) vmovdqa $t0,0x00(%rsp)
vpaddq 0x40($Tbl),@X[4],$t0 vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp) vmovdqa $t1,0x10(%rsp)
vpaddq 0x50($Tbl),@X[5],$t1 vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp) vmovdqa $t2,0x20(%rsp)
vpaddq 0x60($Tbl),@X[6],$t2 vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp) vmovdqa $t3,0x30(%rsp)
vpaddq 0x70($Tbl),@X[7],$t3 vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp) vmovdqa $t0,0x40(%rsp)
mov $A,$a1 mov $A,$a1
vmovdqa $t1,0x50(%rsp) vmovdqa $t1,0x50(%rsp)
...@@ -1066,7 +1148,7 @@ $code.=<<___; ...@@ -1066,7 +1148,7 @@ $code.=<<___;
.align 16 .align 16
.Lxop_00_47: .Lxop_00_47:
add \$16*$SZ,$Tbl add \$16*2*$SZ,$Tbl
___ ___
sub XOP_512_00_47 () { sub XOP_512_00_47 () {
my $j = shift; my $j = shift;
...@@ -1129,7 +1211,7 @@ my @insns = (&$body,&$body); # 52 instructions ...@@ -1129,7 +1211,7 @@ my @insns = (&$body,&$body); # 52 instructions
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
&vpaddq ($t2,@X[0],16*$j."($Tbl)"); &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2); &vmovdqa (16*$j."(%rsp)",$t2);
} }
...@@ -1138,7 +1220,7 @@ my @insns = (&$body,&$body); # 52 instructions ...@@ -1138,7 +1220,7 @@ my @insns = (&$body,&$body); # 52 instructions
&XOP_512_00_47($j,\&body_00_15,@X); &XOP_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X) push(@X,shift(@X)); # rotate(@X)
} }
&cmpb ($SZ-1+16*$SZ."($Tbl)",0); &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lxop_00_47"); &jne (".Lxop_00_47");
for ($i=0; $i<16; ) { for ($i=0; $i<16; ) {
...@@ -1203,7 +1285,7 @@ ___ ...@@ -1203,7 +1285,7 @@ ___
local *ror = sub { &shrd(@_[0],@_) }; local *ror = sub { &shrd(@_[0],@_) };
$code.=<<___; $code.=<<___;
.type ${func}_avx,\@function,4 .type ${func}_avx,\@function,3
.align 64 .align 64
${func}_avx: ${func}_avx:
.Lavx_shortcut: .Lavx_shortcut:
...@@ -1251,12 +1333,12 @@ ___ ...@@ -1251,12 +1333,12 @@ ___
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___; $code.=<<___;
vmovdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_avx jmp .Lloop_avx
.align 16 .align 16
.Lloop_avx: .Lloop_avx:
vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0] vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1] vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2] vmovdqu 0x20($inp),@X[2]
...@@ -1267,9 +1349,9 @@ $code.=<<___; ...@@ -1267,9 +1349,9 @@ $code.=<<___;
vpshufb $t3,@X[2],@X[2] vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0 vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3] vpshufb $t3,@X[3],@X[3]
vpaddd 0x10($Tbl),@X[1],$t1 vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x20($Tbl),@X[2],$t2 vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x30($Tbl),@X[3],$t3 vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp) vmovdqa $t0,0x00(%rsp)
mov $A,$a1 mov $A,$a1
vmovdqa $t1,0x10(%rsp) vmovdqa $t1,0x10(%rsp)
...@@ -1282,7 +1364,7 @@ $code.=<<___; ...@@ -1282,7 +1364,7 @@ $code.=<<___;
.align 16 .align 16
.Lavx_00_47: .Lavx_00_47:
add \$16*$SZ,$Tbl sub \$-16*2*$SZ,$Tbl # size optimization
___ ___
sub Xupdate_256_AVX () { sub Xupdate_256_AVX () {
( (
...@@ -1330,7 +1412,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -1330,7 +1412,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
} }
&vpaddd ($t2,@X[0],16*$j."($Tbl)"); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2); &vmovdqa (16*$j."(%rsp)",$t2);
} }
...@@ -1339,7 +1421,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions ...@@ -1339,7 +1421,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&AVX_256_00_47($j,\&body_00_15,@X); &AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X) push(@X,shift(@X)); # rotate(@X)
} }
&cmpb ($SZ-1+16*$SZ."($Tbl)",0); &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47"); &jne (".Lavx_00_47");
for ($i=0; $i<16; ) { for ($i=0; $i<16; ) {
...@@ -1354,9 +1436,9 @@ $code.=<<___; ...@@ -1354,9 +1436,9 @@ $code.=<<___;
jmp .Lloop_avx jmp .Lloop_avx
.align 16 .align 16
.Lloop_avx: .Lloop_avx:
vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0] vmovdqu 0x00($inp),@X[0]
lea $TABLE(%rip),$Tbl lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1] vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2] vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0] vpshufb $t3,@X[0],@X[0]
...@@ -1370,20 +1452,20 @@ $code.=<<___; ...@@ -1370,20 +1452,20 @@ $code.=<<___;
vpshufb $t3,@X[4],@X[4] vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7] vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5] vpshufb $t3,@X[5],@X[5]
vpaddq 0x00($Tbl),@X[0],$t0 vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6] vpshufb $t3,@X[6],@X[6]
vpaddq 0x10($Tbl),@X[1],$t1 vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7] vpshufb $t3,@X[7],@X[7]
vpaddq 0x20($Tbl),@X[2],$t2 vpaddq -0x40($Tbl),@X[2],$t2
vpaddq 0x30($Tbl),@X[3],$t3 vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp) vmovdqa $t0,0x00(%rsp)
vpaddq 0x40($Tbl),@X[4],$t0 vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp) vmovdqa $t1,0x10(%rsp)
vpaddq 0x50($Tbl),@X[5],$t1 vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp) vmovdqa $t2,0x20(%rsp)
vpaddq 0x60($Tbl),@X[6],$t2 vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp) vmovdqa $t3,0x30(%rsp)
vpaddq 0x70($Tbl),@X[7],$t3 vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp) vmovdqa $t0,0x40(%rsp)
mov $A,$a1 mov $A,$a1
vmovdqa $t1,0x50(%rsp) vmovdqa $t1,0x50(%rsp)
...@@ -1396,14 +1478,14 @@ $code.=<<___; ...@@ -1396,14 +1478,14 @@ $code.=<<___;
.align 16 .align 16
.Lavx_00_47: .Lavx_00_47:
add \$16*$SZ,$Tbl add \$16*2*$SZ,$Tbl
___ ___
sub Xupdate_512_AVX () { sub Xupdate_512_AVX () {
( (
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
'&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
'&vpsrlq ($t2,$t0,$sigma0[0]);', '&vpsrlq ($t2,$t0,$sigma0[0])',
'&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += X[9..10] '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
'&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsrlq ($t3,$t0,$sigma0[2])',
'&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)', '&vpxor ($t0,$t3,$t2)',
...@@ -1413,7 +1495,7 @@ sub Xupdate_512_AVX () { ...@@ -1413,7 +1495,7 @@ sub Xupdate_512_AVX () {
'&vpxor ($t0,$t0,$t2)', '&vpxor ($t0,$t0,$t2)',
'&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpsrlq ($t3,@X[7],$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1])', '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
'&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
'&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpsrlq ($t1,@X[7],$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)', '&vpxor ($t3,$t3,$t2)',
...@@ -1437,7 +1519,7 @@ my @insns = (&$body,&$body); # 52 instructions ...@@ -1437,7 +1519,7 @@ my @insns = (&$body,&$body); # 52 instructions
eval(shift(@insns)); eval(shift(@insns));
eval(shift(@insns)); eval(shift(@insns));
} }
&vpaddq ($t2,@X[0],16*$j."($Tbl)"); &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2); &vmovdqa (16*$j."(%rsp)",$t2);
} }
...@@ -1446,7 +1528,7 @@ my @insns = (&$body,&$body); # 52 instructions ...@@ -1446,7 +1528,7 @@ my @insns = (&$body,&$body); # 52 instructions
&AVX_512_00_47($j,\&body_00_15,@X); &AVX_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X) push(@X,shift(@X)); # rotate(@X)
} }
&cmpb ($SZ-1+16*$SZ."($Tbl)",0); &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lavx_00_47"); &jne (".Lavx_00_47");
for ($i=0; $i<16; ) { for ($i=0; $i<16; ) {
...@@ -1504,6 +1586,389 @@ $code.=<<___; ...@@ -1504,6 +1586,389 @@ $code.=<<___;
ret ret
.size ${func}_avx,.-${func}_avx .size ${func}_avx,.-${func}_avx
___ ___
if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
# bodyx_00_15 - one SHA round for the AVX2+BMI code path, returned as a
# list of Perl snippets rather than emitted directly.
#
# Returns 24 strings. Each one is Perl source that the caller passes to
# string eval; all but the last emit assembly through the perlasm helpers
# (&add, &rorx, &andn, &lea, ...), the last one only does bookkeeping.
# Handing the round out piecewise lets the caller interleave these integer
# instructions with SIMD message-schedule instructions.
#
# The snippets are evaluated in the caller's scope, so the names they use
# (@ROT, $i, $base, $SZ, $PUSH8, $a0..$a4, @Sigma0, @Sigma1) must be visible
# at the eval site, not here.
# NOTE(review): the offset expression $i/(16/$SZ) needs truncating division;
# presumably that comes from the `use integer` in the enclosing block --
# confirm it is in effect wherever these strings are eval'ed.
sub bodyx_00_15 () {
# Preconditions on entry to a round: $a1 is zero, $a3 holds $b^$c and
# $a4 holds a copy of $f.
(
# First snippet: bind the working variables to the current rotation of
# @ROT, then add the precomputed X[i]+K[i] value read from the stack
# (offset wraps modulo $PUSH8; $base supplies the addressing suffix).
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
'&and ($a4,$e)', # f&e
'&rorx ($a0,$e,$Sigma1[2])',
'&rorx ($a2,$e,$Sigma1[1])',
'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
'&lea ($h,"($h,$a4)")',
'&andn ($a4,$e,$g)', # ~e&g
'&xor ($a0,$a2)',
'&rorx ($a1,$e,$Sigma1[0])',
'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
'&xor ($a0,$a1)', # Sigma1(e)
'&mov ($a2,$a)',
'&rorx ($a4,$a,$Sigma0[2])',
'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
'&xor ($a2,$b)', # a^b, b^c in next round
'&rorx ($a1,$a,$Sigma0[1])',
'&rorx ($a0,$a,$Sigma0[0])',
'&lea ($d,"($d,$h)")', # d+=h
'&and ($a3,$a2)', # (b^c)&(a^b)
'&xor ($a1,$a4)',
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&xor ($a1,$a0)', # Sigma0(a)
'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
'&mov ($a4,$e)', # copy of f in future
# Last snippet emits nothing: swap $a2/$a3 so this round's a^b becomes the
# next round's b^c, rotate @ROT by one position and advance the round
# counter $i.
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
);
# Sigma0(a) is left pending in $a1 (it is folded in at the start of the
# following round), so after the final round the caller still has to add
# $a1 into $a.
}
$code.=<<___;
.type ${func}_avx2,\@function,3
.align 64
${func}_avx2:
.Lavx2_shortcut:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
vzeroall
sub \$-16*$SZ,$inp # inp++, size optimization
mov $SZ*0($ctx),$A
xor %r12,%r12 # borrow $T1
mov $SZ*1($ctx),$B
cmp %rdx,$inp # $_end
mov $SZ*2($ctx),$C
sete %r12b
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
# Four %ymm message registers and six %ymm temporaries.
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
# Top of the two-blocks-per-iteration loop (.Loop_avx2):
#  - %r12 (0 or 1 on entry) is scaled to 0 or 16*$SZ and subtracted from
#    inp, so it addresses either the next block or, for the last block,
#    the same block again;
#  - the current block is loaded at -16*$SZ(inp) into $t0/$t1, the block
#    at (%r12) into @X[2]/@X[3];
#  - vperm2i128 0x20/0x31 interleave the two so that each @X register has
#    the current block in its low 128 bits and the other block in its
#    high 128 bits;
#  - every @X register is passed through vpshufb with the mask loaded from
#    $TABLE+$SZ*2*$rounds (presumably the byte-order swap mask - confirm
#    against the table definition);
#  - the first constants at ($Tbl) are added and the sums stored on the
#    stack, with %rsp lowered by $PUSH8 between the two pairs of stores;
#  - $a1 is cleared, $a3 is primed with B^C and $a4 with F for the round
#    code, and $Tbl is advanced by 16*2*$SZ.
# $t4/$t5 are loaded once, before the loop, from the two table entries that
# follow that mask; their users are not visible in this part of the file.
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Loop_avx2
.align 16
.Loop_avx2:
shl \$`log(16*$SZ)/log(2)`,%r12
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
neg %r12
vmovdqu -16*$SZ+0($inp),$t0
add $inp,%r12
vmovdqu -16*$SZ+32($inp),$t1
vmovdqu (%r12),@X[2] # next or same input block
vmovdqu 32(%r12),@X[3]
vperm2i128 \$0x20,@X[2],$t0,@X[0]
#mov $inp,$_inp # offload $inp
vperm2i128 \$0x31,@X[2],$t0,@X[1]
vperm2i128 \$0x20,@X[3],$t1,@X[2]
vperm2i128 \$0x31,@X[3],$t1,@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[0],@X[0]
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
lea -$PUSH8(%rsp),%rsp
mov $B,$a3
vmovdqa $t2,0x00(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x20(%rsp)
mov $F,$a4
sub \$-16*2*$SZ,$Tbl # size optimization
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
# Emit one quarter of a SHA256 message-schedule pass, interleaved with four
# rounds of the compression function.
#   $j    - index of the quarter being generated (the caller passes 0..3)
#   $body - code ref returning the instruction strings for one round
#   @X    - message registers in their current rotation
# Each instruction string from Xupdate_256_AVX() is eval'ed and followed by
# three round instructions; whatever round instructions are left over are
# emitted after the constant addition. The sum of @X[0] and the constants at
# 16*2*$j($Tbl) goes to $t2 and is stored at offset (32*$j)%$PUSH8 from %rsp.
# %rsp is lowered by $PUSH8 before every even-numbered quarter.
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
# $base is not referenced by name in this sub; presumably the eval'ed round
# strings pick it up as the stack offset of their input - confirm against
# the round-body generator.
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
# Body of the .Lavx2_00_47 loop: four schedule quarters, rotating @X by one
# register after each so that the quarter just updated moves to the back.
for ($i=0,$j=0; $j<4; $j++) {
&AVX2_256_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
# Step $Tbl to the next group of constants and repeat the pass for as long
# as the byte at $SZ-1($Tbl) is non-zero (presumably the table is followed
# by data whose byte at that position is zero - confirm against the table).
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1)."($Tbl)",0);
&jne (".Lavx2_00_47");
# Sixteen more rounds with no schedule update. There is no increment in the
# for() header: $i is advanced by the "$i++" at the end of each round's
# eval'ed instruction list. $base switches from $PUSH8(%rsp) to (%rsp) after
# the first eight rounds.
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
} else { # SHA512
# Eight %ymm message registers and four %ymm temporaries.
my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
# Top of the two-blocks-per-iteration loop (.Loop_avx2), SHA512 flavour:
#  - %r12 (0 or 1 on entry) is scaled to 0 or 16*$SZ and subtracted from
#    inp, so it addresses either the next block or, for the last block,
#    the same block again;
#  - the current block is loaded at -16*$SZ(inp) into $t0..$t3, the block
#    at (%r12) into @X[4..7];
#  - vperm2i128 0x20/0x31 interleave the two so that each @X register has
#    the current block in its low 128 bits and the other block in its
#    high 128 bits;
#  - every @X register is passed through vpshufb with the mask loaded from
#    $TABLE+$SZ*2*$rounds (presumably the byte-order swap mask - confirm
#    against the table definition);
#  - $Tbl is biased by 0x80 so the first eight constant groups are reached
#    with offsets -0x80..0x60; the sums are stored on the stack, with %rsp
#    lowered by $PUSH8 after the first four stores;
#  - $a1 is cleared, $a3 is primed with B^C and $a4 with F for the round
#    code, and $Tbl is advanced by 16*2*$SZ.
$code.=<<___;
jmp .Loop_avx2
.align 16
.Loop_avx2:
shl \$`log(16*$SZ)/log(2)`,%r12
vmovdqu -16*$SZ($inp),$t0
neg %r12
vmovdqu -16*$SZ+32($inp),$t1
add $inp,%r12
vmovdqu -16*$SZ+64($inp),$t2
vmovdqu -16*$SZ+96($inp),$t3
vmovdqu (%r12),@X[4] # next or same block
vmovdqu 32(%r12),@X[5]
vmovdqu 64(%r12),@X[6]
vmovdqu 96(%r12),@X[7]
vperm2i128 \$0x20,@X[4],$t0,@X[0]
#mov $inp,$_inp # offload $inp
vperm2i128 \$0x31,@X[4],$t0,@X[1]
vperm2i128 \$0x20,@X[5],$t1,@X[2]
vperm2i128 \$0x31,@X[5],$t1,@X[3]
vperm2i128 \$0x20,@X[6],$t2,@X[4]
vperm2i128 \$0x31,@X[6],$t2,@X[5]
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t2
vperm2i128 \$0x20,@X[7],$t3,@X[6]
vperm2i128 \$0x31,@X[7],$t3,@X[7]
vpshufb $t2,@X[0],@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vpshufb $t2,@X[1],@X[1]
vpshufb $t2,@X[2],@X[2]
vpshufb $t2,@X[3],@X[3]
vpshufb $t2,@X[4],@X[4]
vpshufb $t2,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t2,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t2,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x20(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x40(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x60(%rsp)
lea -$PUSH8(%rsp),%rsp
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
mov $B,$a3
vmovdqa $t2,0x40(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x60(%rsp)
mov $F,$a4
add \$16*2*$SZ,$Tbl
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
# Emit one eighth of a SHA512 message-schedule pass, interleaved with two
# rounds of the compression function.
#   $j    - index of the eighth being generated (the caller passes 0..7)
#   $body - code ref returning the instruction strings for one round
#   @X    - message registers in their current rotation
# Each instruction string from Xupdate_512_AVX() is eval'ed; unless that
# string ends in ';', three round instructions follow it. Left-over round
# instructions are emitted after the constant addition. The sum of @X[0] and
# the constants at 16*2*$j-0x80($Tbl) goes to $t2 and is stored at offset
# (32*$j)%$PUSH8 from %rsp. %rsp is lowered by $PUSH8 before every fourth
# call ($j%4==0).
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 48 instructions
# $base is not referenced by name in this sub; presumably the eval'ed round
# strings pick it up as the stack offset of their input - confirm against
# the round-body generator.
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
if ($_ !~ /\;$/) {
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
# Body of the .Lavx2_00_47 loop: eight schedule steps, rotating @X by one
# register after each so that the register just updated moves to the back.
for ($i=0,$j=0; $j<8; $j++) {
&AVX2_512_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
# Step $Tbl to the next group of constants and repeat the pass for as long
# as the byte at $SZ-1-0x80($Tbl) is non-zero; the -0x80 compensates for the
# bias applied to $Tbl when it was loaded.
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1-0x80)."($Tbl)",0);
&jne (".Lavx2_00_47");
# Sixteen more rounds with no schedule update. There is no increment in the
# for() header: $i is advanced by the "$i++" at the end of each round's
# eval'ed instruction list. $base switches from $PUSH8(%rsp) to (%rsp) after
# the first eight rounds.
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
}
# Finish the first block of the pair: reload ctx from its frame slot, apply
# the pending "$a+=$a1" that the round code leaves for the caller, add the
# previous state from ctx into $A..$H and write the result back. $Tbl is
# repurposed to hold %rsp+2*$SZ*($rounds-8). If inp already equals the end
# pointer there is no second block and control goes to .Ldone_avx2;
# otherwise $a1/$a3/$a4 are re-primed and .Lower_avx2 is entered.
$code.=<<___;
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
cmp `$PUSH8+2*8`($Tbl),$inp # $_end
je .Ldone_avx2
xor $a1,$a1
mov $B,$a3
xor $C,$a3 # magic
mov $F,$a4
jmp .Lower_avx2
.align 16
.Lower_avx2:
___
# Body of the .Lower_avx2 loop: eight rounds per iteration with $base set to
# 16($Tbl). The 16-byte displacement presumably selects the upper 128-bit
# half of each stored 256-bit value, i.e. the second block's data - confirm
# against how the round strings use $base. $i is advanced by the eval'ed
# round strings, not by the for() header.
for ($i=0; $i<8; ) {
my $base="+16($Tbl)";
foreach(bodyx_00_15()) { eval; }
}
# Tail of the .Lower_avx2 loop and of the outer .Loop_avx2 loop:
#  - step $Tbl down by $PUSH8 and keep running rounds while it is still at
#    or above %rsp;
#  - apply the pending "$a+=$a1", fold the previous state from ctx into
#    $A..$H and store it back, while moving %rsp up by 2*$SZ*($rounds-8),
#    advancing inp by two blocks and setting %r12 to 1 when inp has reached
#    the end pointer exactly;
#  - loop back to .Loop_avx2 while inp is at or below the end pointer;
#  - at .Ldone_avx2, take %rsp from $Tbl and fetch the saved original stack
#    pointer into %rsi.
#
# vzeroupper is used instead of vzeroall on purpose: vzeroall clears every
# %ymm register in full, and the Win64 restore code that follows only
# reloads %xmm6-%xmm9 (plus %xmm10-%xmm11 for SHA512), so the remaining
# callee-saved %xmm registers would be returned to the caller zeroed.
# vzeroupper only clears the upper halves, which are volatile in both ABIs.
$code.=<<___;
lea -$PUSH8($Tbl),$Tbl
cmp %rsp,$Tbl
jae .Lower_avx2
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
lea `2*16*$SZ`($inp),$inp # inp+=2
add $SZ*6($ctx),$G
xor %r12,%r12
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
sete %r12b
jbe .Loop_avx2
lea (%rsp),$Tbl
.Ldone_avx2:
lea ($Tbl),%rsp
mov $_rsp,%rsi
vzeroupper
___
# Win64 only: reload %xmm6-%xmm9 from the frame.
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
# Win64 SHA512 only: reload %xmm10-%xmm11 as well.
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
# Reload the six callee-saved GPRs from the original stack area addressed by
# %rsi (reverse of the push order at entry), restore %rsp and return.
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
___
}}
}}}}} }}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
...@@ -1547,7 +2012,17 @@ se_handler: ...@@ -1547,7 +2012,17 @@ se_handler:
lea (%rsi,%r10),%r10 # epilogue label lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue jae .Lin_prologue
___
# AVX2 builds only. The AVX2 code moves %rsp while it runs, so when the
# faulting Rip is at or beyond .Lavx2_shortcut the value in %rax (presumably
# context->Rsp - confirm against the start of se_handler) is normalised back
# to the frame base with the same two steps the AVX2 prologue uses: align
# down to 256*$SZ, then add 2*$SZ*($rounds-8).
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
jb .Lnot_in_avx2
and \$-256*$SZ,%rax
add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
mov %rax,%rsi # put aside Rsp mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax lea 48(%rax),%rax
...@@ -1635,6 +2110,11 @@ $code.=<<___ if ($avx); ...@@ -1635,6 +2110,11 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_${func}_avx .rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx .rva .LSEH_info_${func}_avx
___ ___
# AVX2 builds only: begin/end/info address triple for ${func}_avx2, in the
# same shape as the _avx entry before it (presumably a .pdata function table
# record - the section directive is outside the visible lines).
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
$code.=<<___; $code.=<<___;
.section .xdata .section .xdata
.align 8 .align 8
...@@ -1661,6 +2141,12 @@ $code.=<<___ if ($avx); ...@@ -1661,6 +2141,12 @@ $code.=<<___ if ($avx);
.rva se_handler .rva se_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___ ___
# AVX2 builds only: unwind-info record for ${func}_avx2. It names se_handler
# as the handler and passes the AVX2 prologue/epilogue labels as
# HandlerData[], which se_handler compares against the faulting Rip.
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
} }
$code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\`([^\`]*)\`/eval $1/gem;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册