提交 e3c79f0f 编写于 作者: Andy Polyakov

sha/asm/keccak1600-avx512.pl: improve performance by 17%.

The improvement is the result of combining data layout ideas from the
Keccak Code Package with those of the initial version of this module.

Hardware used for benchmarking courtesy of Atos, experiments run by
Romain Dolbeau <romain.dolbeau@atos.net>. Kudos!
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Rich Salz <rsalz@openssl.org>
上级 e0de4dd5
...@@ -20,28 +20,60 @@ ...@@ -20,28 +20,60 @@
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c). # Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers. # Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence # It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in order that # it's changing as algorithm progresses. Data is saved in linear order,
# benefits Chi, but at the same time is easily convertible to order # but in-register order morphs between rounds. Even rounds take in
# that benefits Theta. Conversion from Chi layout to Theta is # linear layout, and odd rounds - transposed, or "vertically-shaped"...
# explicit and reverse one is kind of fused with Pi...
# #
######################################################################## ########################################################################
# Numbers are cycles per processed byte out of large message. # Numbers are cycles per processed byte out of large message.
# #
# r=1088(*) # r=1088(*)
# #
# Knights Landing 8.9 # Knights Landing 7.6
# Skylake-X 6.7 # Skylake-X 5.7
# #
# (*) Corresponds to SHA3-256. # (*) Corresponds to SHA3-256.
######################################################################## ########################################################################
# Coordinates below correspond to those in sha/keccak1600.c. Layout # Below code is combination of two ideas. One is taken from Keccak Code
# suitable for Chi is one with y coordinates aligned column-wise. Trick # Package, hereafter KCP, and another one from initial version of this
# is to add regular shift to x coordinate, so that Chi can still be # module. What is common is observation that Pi's input and output are
# performed with as little as 7 instructions, yet be converted to layout # "mostly transposed", i.e. if input is aligned by x coordinate, then
# suitable for Theta with intra-register permutations alone. Here is # output is [mostly] aligned by y. Both versions, KCP and predecessor,
# "magic" layout for Chi (with pre-Theta shuffle): # were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
# #
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0] # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0] # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
...@@ -49,53 +81,52 @@ ...@@ -49,53 +81,52 @@
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0] # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0] # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
# #
# Layout suitable to Theta has x coordinates aligned column-wise # Now intra-register permutations yield initial [almost] straight
# [it's interleaved with Pi indices transformation for reference]: # linear layout:
# #
# [4][4] [3][3] [2][2] [1][1] [0][0] $A00 # [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0] ##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0] $A01 # [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4] ##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0] $A02 # [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3] ##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0] $A03 # [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2] ##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0] $A04 # [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1] ##[3][0] [3][4] [3][3] [3][2] [3][1]
# #
# Pi itself is performed by blending above data and finally shuffling it # This means that odd round Chi is performed in less suitable layout,
# to original Chi layout: # with a number of additional permutations. But overall it turned out to be
# # a win. Permutations are fastest possible on Knights Landing and they
# [1][1] [2][2] [3][3] [4][4] [0][0]>1.2.3.4.0>[4][4] [3][3] [2][2] [1][1] [0][0] # are laid down to be independent of each other. In essence I traded
# [2][3] [3][4] [4][0] [0][1] [1][2]>2.3.4.0.1>[4][0] [3][4] [2][3] [1][2] [0][1] # 20 blend instructions for 3 permutations. The result is 13% faster
# [3][0] [4][1] [0][2] [1][3] [2][4]>3.4.0.1.2>[4][1] [3][0] [2][4] [1][3] [0][2] # than KCP on Skylake-X, and >40% on Knights Landing.
# [4][2] [0][3] [1][4] [2][0] [3][1]>4.0.1.2.3>[4][2] [3][1] [2][0] [1][4] [0][3]
# [0][4] [1][0] [2][1] [3][2] [4][3]>0.1.2.3.4>[4][3] [3][2] [2][1] [1][0] [0][4]
# #
# As implied, data is loaded in Chi layout. Digits in variables' names # As implied, data is loaded in straight linear order. Digits in
# represent right most coordinates of loaded data chunk: # variables' names represent coordinates of right-most element of
# loaded data chunk:
my ($A00, # [4][4] [3][3] [2][2] [1][1] [0][0]
$A01, # [4][0] [3][4] [2][3] [1][2] [0][1] my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0]
$A02, # [4][1] [3][0] [2][4] [1][3] [0][2] $A10, # [1][4] [1][3] [1][2] [1][1] [1][0]
$A03, # [4][2] [3][1] [2][0] [1][4] [0][3] $A20, # [2][4] [2][3] [2][2] [2][1] [2][0]
$A04) = # [4][3] [3][2] [2][1] [1][0] [0][4] $A30, # [3][4] [3][3] [3][2] [3][1] [3][0]
$A40) = # [4][4] [4][3] [4][2] [4][1] [4][0]
map("%zmm$_",(0..4)); map("%zmm$_",(0..4));
# We also need to map the magic order into offsets within structure: # We also need to map the magic order into offsets within structure:
my @A_jagged = ([0,0], [1,0], [2,0], [3,0], [4,0], my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
[4,1], [0,1], [1,1], [2,1], [3,1], [1,0], [1,1], [1,2], [1,3], [1,4],
[3,2], [4,2], [0,2], [1,2], [2,2], [2,0], [2,1], [2,2], [2,3], [2,4],
[2,3], [3,3], [4,3], [0,3], [1,3], [3,0], [3,1], [3,2], [3,3], [3,4],
[1,4], [2,4], [3,4], [4,4], [0,4]); [4,0], [4,1], [4,2], [4,3], [4,4]);
@A_jagged_in = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear
@A_jagged_out = map(8*($$_[0]*5+$$_[1]), @A_jagged); # ... and now linear
my @T = map("%zmm$_",(5..7,16..17)); my @T = map("%zmm$_",(5..12));
my @Chi = map("%zmm$_",(18..22)); my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo
my @Theta = map("%zmm$_",(33,23..26)); # invalid @Theta[0] is not typo my @Pi0 = map("%zmm$_",(17..21));
my @Rhotate = map("%zmm$_",(27..31)); my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));
my ($C00,$D00) = @T[0..1]; my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6)); my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
...@@ -107,82 +138,136 @@ $code.=<<___; ...@@ -107,82 +138,136 @@ $code.=<<___;
.align 32 .align 32
__KeccakF1600: __KeccakF1600:
lea iotas(%rip),%r10 lea iotas(%rip),%r10
mov \$24,%eax mov \$12,%eax
jmp .Loop_avx512 jmp .Loop_avx512
.align 32 .align 32
.Loop_avx512: .Loop_avx512:
######################################### Theta ######################################### Theta, even round
#vpermq $A00,@Theta[0],$A00 # doesn't actually change order
vpermq $A01,@Theta[1],$A01
vpermq $A02,@Theta[2],$A02
vpermq $A03,@Theta[3],$A03
vpermq $A04,@Theta[4],$A04
vmovdqa64 $A00,@T[0] # put aside original A00 vmovdqa64 $A00,@T[0] # put aside original A00
vpternlogq \$0x96,$A02,$A01,$A00 # and use it as "C00" vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
vpternlogq \$0x96,$A04,$A03,$A00 vpternlogq \$0x96,$A40,$A30,$A00
vprolq \$1,$A00,$D00 vprolq \$1,$A00,$D00
vpermq $A00,@Theta[1],$A00 vpermq $A00,@Theta[1],$A00
vpermq $D00,@Theta[4],$D00 vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00 vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
vpternlogq \$0x96,$A00,$D00,$A01 vpternlogq \$0x96,$A00,$D00,$A10
vpternlogq \$0x96,$A00,$D00,$A02 vpternlogq \$0x96,$A00,$D00,$A20
vpternlogq \$0x96,$A00,$D00,$A03 vpternlogq \$0x96,$A00,$D00,$A30
vpternlogq \$0x96,$A00,$D00,$A04 vpternlogq \$0x96,$A00,$D00,$A40
######################################### Rho ######################################### Rho
vprolvq @Rhotate[0],@T[0],$A00 # T[0] is original A00 vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
vprolvq @Rhotate[1],$A01,$A01 vprolvq @Rhotate0[1],$A10,$A10
vprolvq @Rhotate[2],$A02,$A02 vprolvq @Rhotate0[2],$A20,$A20
vprolvq @Rhotate[3],$A03,$A03 vprolvq @Rhotate0[3],$A30,$A30
vprolvq @Rhotate[4],$A04,$A04 vprolvq @Rhotate0[4],$A40,$A40
######################################### Pi ######################################### Pi
vpblendmq $A02,$A00,@{T[0]}{$k00010} vpermq $A00,@Pi0[0],$A00
vpblendmq $A00,$A03,@{T[1]}{$k00010} vpermq $A10,@Pi0[1],$A10
vpblendmq $A03,$A01,@{T[2]}{$k00010} vpermq $A20,@Pi0[2],$A20
vpblendmq $A01,$A04,@{T[3]}{$k00010} vpermq $A30,@Pi0[3],$A30
vpblendmq $A04,$A02,@{T[4]}{$k00010} vpermq $A40,@Pi0[4],$A40
vpblendmq $A04,@T[0],@{T[0]}{$k00100}
vpblendmq $A02,@T[1],@{T[1]}{$k00100}
vpblendmq $A00,@T[2],@{T[2]}{$k00100}
vpblendmq $A03,@T[3],@{T[3]}{$k00100}
vpblendmq $A01,@T[4],@{T[4]}{$k00100}
vpblendmq $A01,@T[0],@{T[0]}{$k01000}
vpblendmq $A04,@T[1],@{T[1]}{$k01000}
vpblendmq $A02,@T[2],@{T[2]}{$k01000}
vpblendmq $A00,@T[3],@{T[3]}{$k01000}
vpblendmq $A03,@T[4],@{T[4]}{$k01000}
vpblendmq $A03,@T[0],@{T[0]}{$k10000}
vpblendmq $A01,@T[1],@{T[1]}{$k10000}
vpblendmq $A04,@T[2],@{T[2]}{$k10000}
vpblendmq $A02,@T[3],@{T[3]}{$k10000}
vpblendmq $A00,@T[4],@{T[4]}{$k10000}
vpermq @T[0],@Chi[0],$A00
vpermq @T[1],@Chi[1],$A01
vpermq @T[2],@Chi[2],$A02
vpermq @T[3],@Chi[3],$A03
vpermq @T[4],@Chi[4],$A04
######################################### Chi ######################################### Chi
vmovdqa64 $A00,@T[0] vmovdqa64 $A00,@T[0]
vpternlogq \$0xD2,$A02,$A01,$A00 vmovdqa64 $A10,@T[1]
vmovdqa64 $A01,@T[1] vpternlogq \$0xD2,$A20,$A10,$A00
vpternlogq \$0xD2,$A03,$A02,$A01 vpternlogq \$0xD2,$A30,$A20,$A10
vpternlogq \$0xD2,$A04,$A03,$A02 vpternlogq \$0xD2,$A40,$A30,$A20
vpternlogq \$0xD2,@T[0],$A04,$A03 vpternlogq \$0xD2,@T[0],$A40,$A30
vpternlogq \$0xD2,@T[1],@T[0],$A04 vpternlogq \$0xD2,@T[1],@T[0],$A40
######################################### Iota ######################################### Iota
vpxorq (%r10),$A00,${A00}{$k00001} vpxorq (%r10),$A00,${A00}{$k00001}
lea 8(%r10),%r10 lea 16(%r10),%r10
######################################### Harmonize rounds
vpblendmq $A20,$A10,@{T[1]}{$k00010}
vpblendmq $A30,$A20,@{T[2]}{$k00010}
vpblendmq $A40,$A30,@{T[3]}{$k00010}
vpblendmq $A10,$A00,@{T[0]}{$k00010}
vpblendmq $A00,$A40,@{T[4]}{$k00010}
vpblendmq $A30,@T[1],@{T[1]}{$k00100}
vpblendmq $A40,@T[2],@{T[2]}{$k00100}
vpblendmq $A20,@T[0],@{T[0]}{$k00100}
vpblendmq $A00,@T[3],@{T[3]}{$k00100}
vpblendmq $A10,@T[4],@{T[4]}{$k00100}
vpblendmq $A40,@T[1],@{T[1]}{$k01000}
vpblendmq $A30,@T[0],@{T[0]}{$k01000}
vpblendmq $A00,@T[2],@{T[2]}{$k01000}
vpblendmq $A10,@T[3],@{T[3]}{$k01000}
vpblendmq $A20,@T[4],@{T[4]}{$k01000}
vpblendmq $A40,@T[0],@{T[0]}{$k10000}
vpblendmq $A00,@T[1],@{T[1]}{$k10000}
vpblendmq $A10,@T[2],@{T[2]}{$k10000}
vpblendmq $A20,@T[3],@{T[3]}{$k10000}
vpblendmq $A30,@T[4],@{T[4]}{$k10000}
#vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
vpermq @T[1],@Theta[1],$A10
vpermq @T[2],@Theta[2],$A20
vpermq @T[3],@Theta[3],$A30
vpermq @T[4],@Theta[4],$A40
######################################### Theta, odd round
vmovdqa64 $T[0],$A00 # real A00
vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
vpternlogq \$0x96,$A40,$A30,$C00
vprolq \$1,$C00,$D00
vpermq $C00,@Theta[1],$C00
vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$C00,$D00,$A00
vpternlogq \$0x96,$C00,$D00,$A30
vpternlogq \$0x96,$C00,$D00,$A10
vpternlogq \$0x96,$C00,$D00,$A40
vpternlogq \$0x96,$C00,$D00,$A20
######################################### Rho
vprolvq @Rhotate1[0],$A00,$A00
vprolvq @Rhotate1[3],$A30,@T[1]
vprolvq @Rhotate1[1],$A10,@T[2]
vprolvq @Rhotate1[4],$A40,@T[3]
vprolvq @Rhotate1[2],$A20,@T[4]
vpermq $A00,@Theta[4],@T[5]
vpermq $A00,@Theta[3],@T[6]
######################################### Iota
vpxorq -8(%r10),$A00,${A00}{$k00001}
######################################### Pi
vpermq @T[1],@Theta[2],$A10
vpermq @T[2],@Theta[4],$A20
vpermq @T[3],@Theta[1],$A30
vpermq @T[4],@Theta[3],$A40
######################################### Chi
vpternlogq \$0xD2,@T[6],@T[5],$A00
vpermq @T[1],@Theta[1],@T[7]
#vpermq @T[1],@Theta[0],@T[1]
vpternlogq \$0xD2,@T[1],@T[7],$A10
vpermq @T[2],@Theta[3],@T[0]
vpermq @T[2],@Theta[2],@T[2]
vpternlogq \$0xD2,@T[2],@T[0],$A20
#vpermq @T[3],@Theta[0],@T[3]
vpermq @T[3],@Theta[4],@T[1]
vpternlogq \$0xD2,@T[1],@T[3],$A30
vpermq @T[4],@Theta[2],@T[0]
vpermq @T[4],@Theta[1],@T[4]
vpternlogq \$0xD2,@T[4],@T[0],$A40
dec %eax dec %eax
jnz .Loop_avx512 jnz .Loop_avx512
...@@ -208,8 +293,6 @@ SHA3_absorb: ...@@ -208,8 +293,6 @@ SHA3_absorb:
lea 96($inp),$inp lea 96($inp),$inp
lea 128(%rsp),%r9 lea 128(%rsp),%r9
vzeroupper
lea theta_perm(%rip),%r8 lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111 kxnorw $k11111,$k11111,$k11111
...@@ -226,24 +309,30 @@ SHA3_absorb: ...@@ -226,24 +309,30 @@ SHA3_absorb:
vmovdqa64 64*3(%r8),@Theta[3] vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4] vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate[0] vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate[1] vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate[2] vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate[3] vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate[4] vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*10(%r8),@Chi[0] vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*11(%r8),@Chi[1] vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*12(%r8),@Chi[2] vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*13(%r8),@Chi[3] vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*14(%r8),@Chi[4] vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vpxorq @T[0],@T[0],@T[0] vpxorq @T[0],@T[0],@T[0]
vmovdqu64 40*1-96($A_flat),${A01}{$k11111}{z} vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A02}{$k11111}{z} vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A03}{$k11111}{z} vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A04}{$k11111}{z} vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
vmovdqa64 @T[0],1*64-128(%r9) vmovdqa64 @T[0],1*64-128(%r9)
...@@ -263,7 +352,7 @@ ___ ...@@ -263,7 +352,7 @@ ___
for(my $i=0; $i<25; $i++) { for(my $i=0; $i<25; $i++) {
$code.=<<___ $code.=<<___
mov 8*$i-96($inp),%r8 mov 8*$i-96($inp),%r8
mov %r8,$A_jagged_in[$i]-128(%r9) mov %r8,$A_jagged[$i]-128(%r9)
dec %eax dec %eax
jz .Labsorved_avx512 jz .Labsorved_avx512
___ ___
...@@ -273,10 +362,10 @@ $code.=<<___; ...@@ -273,10 +362,10 @@ $code.=<<___;
lea ($inp,$bsz),$inp lea ($inp,$bsz),$inp
vpxorq 64*0-128(%r9),$A00,$A00 vpxorq 64*0-128(%r9),$A00,$A00
vpxorq 64*1-128(%r9),$A01,$A01 vpxorq 64*1-128(%r9),$A10,$A10
vpxorq 64*2-128(%r9),$A02,$A02 vpxorq 64*2-128(%r9),$A20,$A20
vpxorq 64*3-128(%r9),$A03,$A03 vpxorq 64*3-128(%r9),$A30,$A30
vpxorq 64*4-128(%r9),$A04,$A04 vpxorq 64*4-128(%r9),$A40,$A40
call __KeccakF1600 call __KeccakF1600
...@@ -285,10 +374,10 @@ $code.=<<___; ...@@ -285,10 +374,10 @@ $code.=<<___;
.align 32 .align 32
.Ldone_absorb_avx512: .Ldone_absorb_avx512:
vmovdqu64 $A00,40*0-96($A_flat){$k11111} vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A01,40*1-96($A_flat){$k11111} vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A02,40*2-96($A_flat){$k11111} vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A03,40*3-96($A_flat){$k11111} vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A04,40*4-96($A_flat){$k11111} vmovdqu64 $A40,40*4-96($A_flat){$k11111}
vzeroupper vzeroupper
...@@ -307,8 +396,6 @@ SHA3_squeeze: ...@@ -307,8 +396,6 @@ SHA3_squeeze:
cmp $bsz,$len cmp $bsz,$len
jbe .Lno_output_extension_avx512 jbe .Lno_output_extension_avx512
vzeroupper
lea theta_perm(%rip),%r8 lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111 kxnorw $k11111,$k11111,$k11111
...@@ -325,65 +412,72 @@ SHA3_squeeze: ...@@ -325,65 +412,72 @@ SHA3_squeeze:
vmovdqa64 64*3(%r8),@Theta[3] vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4] vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate[0] vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate[1] vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate[2] vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate[3] vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate[4] vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*10(%r8),@Chi[0] vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*11(%r8),@Chi[1] vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*12(%r8),@Chi[2] vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*13(%r8),@Chi[3] vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*14(%r8),@Chi[4] vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vmovdqu64 40*1-96($A_flat),${A01}{$k11111}{z} vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A02}{$k11111}{z} vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A03}{$k11111}{z} vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A04}{$k11111}{z} vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
.Lno_output_extension_avx512: .Lno_output_extension_avx512:
shr \$3,$bsz shr \$3,$bsz
lea -96($A_flat),%r9
mov $bsz,%rax mov $bsz,%rax
jmp .Loop_squeeze_avx512
.align 32
.Loop_squeeze_avx512: .Loop_squeeze_avx512:
mov @A_jagged_out[$i]-96($A_flat),%r8 cmp \$8,$len
___ jb .Ltail_squeeze_avx512
for (my $i=0; $i<25; $i++) {
$code.=<<___; mov (%r9),%r8
sub \$8,$len lea 8(%r9),%r9
jc .Ltail_squeeze_avx512
mov %r8,($out) mov %r8,($out)
lea 8($out),$out lea 8($out),$out
je .Ldone_squeeze_avx512 sub \$8,$len # len -= 8
dec %eax jz .Ldone_squeeze_avx512
je .Lextend_output_avx512
mov @A_jagged_out[$i+1]-96($A_flat),%r8 sub \$1,%rax # bsz--
___ jnz .Loop_squeeze_avx512
}
$code.=<<___; #vpermq @Theta[4],@Theta[4],@Theta[3]
.Lextend_output_avx512: #vpermq @Theta[3],@Theta[4],@Theta[2]
call __KeccakF1600 #vpermq @Theta[3],@Theta[3],@Theta[1]
call __KeccakF1600
vmovdqu64 $A00,40*0-96($A_flat){$k11111} vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A01,40*1-96($A_flat){$k11111} vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A02,40*2-96($A_flat){$k11111} vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A03,40*3-96($A_flat){$k11111} vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A04,40*4-96($A_flat){$k11111} vmovdqu64 $A40,40*4-96($A_flat){$k11111}
lea -96($A_flat),%r9
mov $bsz,%rax mov $bsz,%rax
jmp .Loop_squeeze_avx512 jmp .Loop_squeeze_avx512
.Ltail_squeeze_avx512: .Ltail_squeeze_avx512:
add \$8,$len mov %r9,%rsi
.Loop_tail_avx512: mov $out,%rdi
mov %r8b,($out) mov $len,%rcx
lea 1($out),$out .byte 0xf3,0xa4 # rep movsb
shr \$8,%r8
dec $len
jnz .Loop_tail_avx512
.Ldone_squeeze_avx512: .Ldone_squeeze_avx512:
vzeroupper vzeroupper
...@@ -400,19 +494,27 @@ theta_perm: ...@@ -400,19 +494,27 @@ theta_perm:
.quad 2, 3, 4, 0, 1, 5, 6, 7 .quad 2, 3, 4, 0, 1, 5, 6, 7
.quad 1, 2, 3, 4, 0, 5, 6, 7 .quad 1, 2, 3, 4, 0, 5, 6, 7
rhotates: rhotates1:
.quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
.quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
.quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
.quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
.quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
chi_perm: rhotates0:
.quad 0, 4, 3, 2, 1, 5, 6, 7 .quad 0, 1, 62, 28, 27, 0, 0, 0
.quad 1, 0, 4, 3, 2, 5, 6, 7 .quad 36, 44, 6, 55, 20, 0, 0, 0
.quad 2, 1, 0, 4, 3, 5, 6, 7 .quad 3, 10, 43, 25, 39, 0, 0, 0
.quad 3, 2, 1, 0, 4, 5, 6, 7 .quad 41, 45, 15, 21, 8, 0, 0, 0
.quad 4, 3, 2, 1, 0, 5, 6, 7 .quad 18, 2, 61, 56, 14, 0, 0, 0
pi0_perm:
.quad 0, 3, 1, 4, 2, 5, 6, 7
.quad 1, 4, 2, 0, 3, 5, 6, 7
.quad 2, 0, 3, 1, 4, 5, 6, 7
.quad 3, 1, 4, 2, 0, 5, 6, 7
.quad 4, 2, 0, 3, 1, 5, 6, 7
iotas: iotas:
.quad 0x0000000000000001 .quad 0x0000000000000001
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册