提交 91dbdc63 编写于 作者: A Andy Polyakov

sha/asm/keccak1600-avx2.pl: remodel register usage.

This gives much more freedom to rearrange instructions. This is an
unoptimized version, provided for reference. Basically, you need
to compare it to the initial commit 29724d0e
to figure out the key difference.
Reviewed-by: Rich Salz <rsalz@openssl.org>
上级 74df8c4c
......@@ -111,16 +111,10 @@ my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
#
# r=1088(*)
#
# Haswell 8.9/+8%
# Skylake 7.9/+19%
# Ryzen 17(**)
# Haswell 9.5
# Skylake 8.8
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
# coefficient in comparison to scalar keccak1600-x86_64.pl.
# (**) It's expected that Ryzen performs poorly, because instruction
# issue rate is limited to two AVX2 instructions per cycle and
# in addition vpblendd is reportedly bound to specific port.
# Obviously this code path should not be executed on Ryzen.
# (*) Corresponds to SHA3-256.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
......@@ -140,135 +134,137 @@ __KeccakF1600:
.align 32
.Loop_avx2:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
vpxor $A31,$A01,$C14
vpxor $A41,$A21,@T[0]
vpxor $A11,$C14,$C14
vpxor @T[0],$C14,$C14 # C[1..4]
vpermq \$0b11111111,$C14,@T[3]
vpermq \$0b10010011,$C14,@T[4]
vpxor $A01,$A31,$C14
vpxor $A21,$C14,$C14
vpxor $A41,$C14,$C14
vpxor $A11,$C14,$C14 # C[1..4]
vpermq \$0b10110001,$A20,$C00
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]
vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]
vpsrlq \$63,$C14,@T[1]
vpaddq $C14,$C14,@T[2]
vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)
vpaddq $C14,$C14,@T[3]
vpor @T[3],@T[1],@T[1] # ROL64(C[1..4],1)
vpermq \$0b00111001,@T[1],$D14
vpxor @T[3],@T[1],$D00
vpsrlq \$63,$C00,@T[0]
vpaddq $C00,$C00,@T[2]
vpor @T[2],@T[0],@T[0] # ROL64(C[0..0],1)
vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]
vpermq \$0b00000000,@T[1],$D00
vpermq \$0b11111111,$C14,@T[3]
vpxor @T[3],$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpsrlq \$63,$C00,@T[0]
vpaddq $C00,$C00,@T[1]
vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)
vpermq \$0b00111001,@T[1],$D14
vpblendd \$0b11000000,@T[0],$D14,$D14
vpermq \$0b10010011,$C14,@T[2]
vpblendd \$0b00000011,$C00,@T[2],@T[2]
vpxor @T[2],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D00,$A00,$A00 # ^= D[0..0]
vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D14,$A01,$A01 # ^= D[1..4]
vpxor $D14,$A31,$A31 # ^= D[1..4]
vpxor $D14,$A21,$A21 # ^= D[1..4]
vpxor $D14,$A41,$A41 # ^= D[1..4]
vpxor $D14,$A11,$A11 # ^= D[1..4]
vpblendd \$0b11000000,@T[1],$D14,$D14
vpblendd \$0b00000011,$C00,@T[4],@T[4]
vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
######################################### Rho + Pi + pre-Chi shuffle
######################################### Rho
vpsllvq 0*32-96(%r8),$A20,@T[0]
vpsrlvq 0*32-96(%r9),$A20,$A20
vpor $A20,@T[0],@T[0] # $A20
vpor @T[0],$A20,$A20
vpsllvq 1*32-96(%r8),$A01,@T[1]
vpsrlvq 1*32-96(%r9),$A01,$A01
vpor @T[1],$A01,$A01
vpxor $D14,$A31,$A31 # ^= D[1..4]
vpsllvq 2*32-96(%r8),$A31,@T[2]
vpsrlvq 2*32-96(%r9),$A31,$A31
vpor $A31,@T[2],@T[2] # $A31
vpor @T[2],$A31,$A31
vpxor $D14,$A21,$A21 # ^= D[1..4]
vpsllvq 3*32-96(%r8),$A21,@T[3]
vpsrlvq 3*32-96(%r9),$A21,$A21
vpor $A21,@T[3],@T[3] # $A21
vpor @T[3],$A21,$A21
vpermq \$0b10001101,@T[0],$A31 # $A20 -> $A31
vpermq \$0b10001101,@T[2],$A21 # $A31 -> $A21
vpxor $D14,$A41,$A41 # ^= D[1..4]
vpsllvq 4*32-96(%r8),$A41,@T[4]
vpsrlvq 4*32-96(%r9),$A41,$A41
vpor @T[4],$A41,$A41
vpxor $D14,$A01,$A01 # ^= D[1..4]
vpxor $D14,$A11,$T[6] # ^= D[1..4]
vpsllvq 1*32-96(%r8),$A01,@T[1]
vpsrlvq 1*32-96(%r9),$A01,$A01
vpor $A41,@T[4],@T[4] # $A41
vpor @T[1],$A01,$A20 # $A01 -> $A20
vpsllvq 5*32-96(%r8),$A11,@T[5]
vpsrlvq 5*32-96(%r9),$A11,$A11
vpor @T[5],$A11,$A11
vpermq \$0b00011011,@T[3],$A41 # $A21 -> $A41
vpermq \$0b01110010,@T[4],$A11 # $A41 -> $A11
vpsllvq 5*32-96(%r8),$T[6],@T[5]
vpsrlvq 5*32-96(%r9),@T[6],@T[6]
vpor @T[5],@T[6],$A01 # $A11 -> $A01
######################################### Pi + pre-Chi shuffle
vpermq \$0b01110010,$A41,@T[6] # vpermq \$0b00011011,$A41,$A11
vpermq \$0b00011011,$A21,@T[5] # vpermq \$0b01110010,$A21,$A41
vpermq \$0b10001101,$A31,@T[4] # vpermq \$0b10001101,$A31,$A21
vpermq \$0b10001101,$A20,@T[3] # vpermq \$0b01110010,$A20,$A31
vmovdqa $A01,@T[2]
vmovdqa $A11,@T[1]
######################################### Chi
vpsrldq \$8,$A01,@T[0]
vpandn @T[0],$A01,@T[0] # tgting [0][0]
vpermq \$0b00000000,@T[1],@T[0] # [0][1] [0][1] [0][1] [0][1]
vpermq \$0b01010101,@T[1],@T[7] # [0][2] [0][2] [0][2] [0][2]
vpandn @T[7],@T[0],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]
vpermq \$0b00111001,$A01,@T[1] # [0][1] [0][4] [0][3] [0][2]
vpermq \$0b00011110,$A01,@T[8] # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b11000000,$A00,@T[1],@T[1] # [0][0] [0][4] [0][3] [0][2]
vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
vpermq \$0b00011110,@T[1],@T[8] # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
vpblendd \$0b00110000,$A00,@T[8],@T[8] # [0][1] [0][0] [0][4] [0][3]
vpxor @T[0],$A00,$A00 # broadcasted below
vpandn @T[8],@T[1],@T[1] # tgting [0][4] [0][3] [0][2] [0][1]
vpblendd \$0b00001100,$A41,$A21, @T[2] # [4][1] [2][1]
vpblendd \$0b00001100,$A21,$A11, @T[4] # [4][2] [2][2]
vpblendd \$0b00110000,$A11,@T[2],@T[2] # [1][1] [4][1] [2][1]
vpblendd \$0b00110000,$A31,@T[4],@T[4] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,$A31,@T[2],@T[2] # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b11000000,$A41,@T[4],@T[4] # [3][2] [1][2] [4][2] [2][2]
vpandn @T[4],@T[2],@T[2] # tgting [3][0] [1][0] [4][0] [2][0]
vpblendd \$0b00001100,$A11,$A20, @T[3] # [4][4] [2][0]
vpblendd \$0b00001100,$A20,$A21, @T[5] # [4][0] [2][1]
vpblendd \$0b00110000,$A21,@T[3],@T[3] # [1][3] [4][4] [2][0]
vpblendd \$0b00110000,$A41,@T[5],@T[5] # [1][4] [4][0] [2][1]
vpblendd \$0b11000000,$A41,@T[3],@T[3] # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b11000000,$A11,@T[5],@T[5] # [3][3] [1][4] [4][0] [2][1]
vpandn @T[5],@T[3],@T[3] # tgting [3][1] [1][2] [4][3] [2][4]
vpxor $A31,@T[3],@T[3]
vpblendd \$0b00001100,$A21,$A31, @T[5] # [4][2] [2][4]
vpblendd \$0b00001100,$A31,$A20, @T[6] # [4][3] [2][0]
vpblendd \$0b00110000,$A20,@T[5],@T[5] # [1][0] [4][2] [2][4]
vpblendd \$0b00110000,$A11,@T[6],@T[6] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,$A11,@T[5],@T[5] # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b11000000,$A21,@T[6],@T[6] # [3][4] [1][1] [4][3] [2][0]
vpandn @T[6],@T[5],@T[5] # tgting [3][2] [1][4] [4][1] [2][3]
vpxor $A41,@T[5],@T[5]
vpblendd \$0b00001100,$A20,$A41, @T[6] # [4][0] [2][3]
vpblendd \$0b00001100,$A41,$A31, @T[7] # [4][1] [2][4]
vpblendd \$0b00110000,$A31,@T[6],@T[6] # [1][2] [4][0] [2][3]
vpblendd \$0b00110000,$A21,@T[7],@T[7] # [1][3] [4][1] [2][4]
vpblendd \$0b11000000,$A21,@T[6],@T[6] # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b11000000,$A20,@T[7],@T[7] # [3][0] [1][3] [4][1] [2][4]
vpblendd \$0b00001100,$A31,$A41, @T[4] # [1][4] [4][3]
vpblendd \$0b11000000,$A31,$A41, @T[8] # [3][1] [2][3]
vpandn @T[7],@T[6],@T[6] # tgting [3][3] [1][1] [4][4] [2][2]
vpermq \$0b00011011,@T[3],$A31 ######### post-Chi shuffle
vpermq \$0b10001101,@T[5],$A41
vpxor $A11,@T[6],@T[6]
vpermq \$0b00000000,$A00,$A00 # broadcast A[0][0]
vpblendd \$0b00000011,$A11,@T[4],@T[4] # [1][4] [4][3] [2][2]
vpblendd \$0b00001100,$A11,@T[8],@T[8] # [3][1] [4][4] [2][3]
vpermq \$0b01110010,@T[6],$A11
vpblendd \$0b11000000,$A20,@T[4],@T[4] # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b00110000,$A20,@T[8],@T[8] # [3][1] [1][0] [4][4] [2][3]
vpandn @T[8],@T[4],@T[4] # tgting [3][4] [1][3] [4][2] [2][1]
vpxor @T[2],$A20,$A20
vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]
vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]
vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b00001100,@T[3],@T[2],@T[8] # [4][3] [2][0]
vpblendd \$0b00110000,@T[6],@T[8],@T[8] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,@T[4],@T[8],@T[8] # [3][4] [1][1] [4][3] [2][0]
vpandn @T[8],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]
vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b00001100,@T[5],@T[3],@T[7] # [4][1] [2][4]
vpblendd \$0b00110000,@T[4],@T[7],@T[7] # [1][3] [4][1] [2][4]
vpblendd \$0b11000000,@T[2],@T[7],@T[7] # [3][0] [1][3] [4][1] [2][4]
vpandn @T[7],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor @T[2],$A20,$A20
vpxor @T[3],$A31,$A31
vpxor @T[4],$A21,$A21
vpxor @T[5],$A41,$A41
vpxor @T[6],$A11,$A11
vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
vpermq \$0b10001101,$A41,$A41
vpermq \$0b01110010,$A11,$A11
######################################### Iota
vpxor (%r10),$A00,$A00
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册