OpenHarmony / Third Party Openssl

Commit c4558efb
Author: Andy Polyakov
Date:   Feb 14, 2013
Parent: 750398ac

    sha512-x86_64.pl: add AVX2 code path.

1 changed file with 568 additions and 82 deletions.

crypto/sha/asm/sha512-x86_64.pl  (+568, -82)
...
...
@@ -59,6 +59,15 @@
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512.
######################################################################
# Current performance in cycles per processed byte (less is better):
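The new header comment above is the heart of the patch: two consecutive input blocks travel side by side, block 1 in the low 128-bit lanes and block 2 in the high lanes of each %ymm register, which is what the vperm2i128 \$0x20/\$0x31 sequences in the .Loop_avx2 hunks further down construct. The stack-frame figures also check out against the new prologue: it reserves 2*$SZ*$rounds+4*8 bytes (2*4*64+32 = 544 for SHA256, 2*8*80+32 = 1312 for SHA512), where the 128-bit paths use $framesz = 16*$SZ+4*8 (96 and 160), i.e. 448 and 1152 bytes more. A rough stand-alone model of the lane selection, in plain Perl with invented names and each string standing for one 128-bit lane:

#!/usr/bin/env perl
# Toy model of the two-block interleave used by the AVX2 path.
# vperm2i128 $imm,$src2,$src1,$dst (AT&T order): the low lane of $dst is
# lane imm[1:0] and the high lane is lane imm[5:4], counted over
# (src1.lo, src1.hi, src2.lo, src2.hi).
use strict; use warnings;

sub vperm2i128 {
    my ($imm, $src2, $src1) = @_;
    my @lanes = (@$src1, @$src2);
    return [ $lanes[$imm & 3], $lanes[($imm >> 4) & 3] ];
}

my $t0 = [ "blk1[0..15]",  "blk1[16..31]" ];   # first 32 bytes of block 1
my $x2 = [ "blk2[0..15]",  "blk2[16..31]" ];   # same 32 bytes of block 2

my $X0 = vperm2i128(0x20, $x2, $t0);   # [ blk1[0..15]  | blk2[0..15]  ]
my $X1 = vperm2i128(0x31, $x2, $t0);   # [ blk1[16..31] | blk2[16..31] ]
printf "X0 = [ %s | %s ]\n", @$X0;
printf "X1 = [ %s | %s ]\n", @$X1;

Run as-is it prints the first block's data in the low halves and the second block's in the high halves, matching the comment above; the real code then runs the existing AVX instruction sequence on these %ymm values unchanged.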
...
...
@@ -69,13 +78,13 @@
# P4 17.5 - - 33.4 -
# Core 2 15.5 13.9(+11%) - 10.3 -
# Westmere 15.1 12.5(+21%) - 9.72 -
-# Atom 23.0 21.6(+6%) - 14.7 -
-# VIA Nano 23.0 16.3(+41%) - 14.7 -
# Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
# Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%)
# Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%)
+# VIA Nano 23.0 16.3(+41%) - 14.7 -
+# Atom 23.0 21.6(+6%) - 14.7 -
#
-# (*) whichever applicable;
+# (*) whichever best applicable;
# (**) switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
# part, body_00_15; reducing the amount of SIMD instructions
...
...
@@ -93,15 +102,20 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

-$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-	   $1>=2.19);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
-	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-	   $1>=2.09);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
-	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
-	   $1>=10);
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
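The probe above no longer just flags AVX: each comparison contributes 0 or 1, so $avx ends up 0 (no capable assembler), 1 (AVX/XOP only) or 2 (AVX2-capable), and the new code path further down is gated with if ($avx>1). A stand-alone illustration of the idiom, with invented assembler banners:

#!/usr/bin/env perl
# The ($1>=x)+($1>=y) idiom: each boolean adds 0 or 1, so the sum is a
# capability level rather than a flag.  The sample banners are made up.
use strict; use warnings;

for my $banner ("GNU assembler version 2.18 (x86_64-linux)",
                "GNU assembler version 2.20 (x86_64-linux)",
                "GNU assembler version 2.23 (x86_64-linux)") {
    my $avx = 0;
    if ($banner =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
    }
    printf "%-45s -> \$avx = %d\n", $banner, $avx;
}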
...
...
@@ -145,6 +159,8 @@ $framesz="16*$SZ+4*8";
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+  my $STRIDE=$SZ;
+     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
...
...
@@ -186,7 +202,7 @@ $code.=<<___ if ($i>=15);
mov `$SZ*(($i+2)&0xf)`(%rsp),$a0
___
$code.=<<___;
-lea $SZ($Tbl),$Tbl # round++
+lea $STRIDE($Tbl),$Tbl # round++
add $a1,$h # h+=Sigma0(a)
___
...
...
@@ -229,28 +245,34 @@ $code=<<___;
.extern OPENSSL_ia32cap_P
.globl $func
-.type $func,\@function,4
+.type $func,\@function,3
.align 16
$func:
___
$code.=<<___ if ($SZ==4 || $avx);
lea OPENSSL_ia32cap_P(%rip),%r11
-mov 0(%r11),%r10d
-mov 4(%r11),%r11d
+mov 0(%r11),%r9d
+mov 4(%r11),%r10d
+mov 8(%r11),%r11d
___
$code.=<<___ if ($avx && $SZ==8);
-test \$`1<<11`,%r11d # check for XOP
+test \$`1<<11`,%r10d # check for XOP
jnz .Lxop_shortcut
___
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
cmp \$`1<<8|1<<5|1<<3`,%r11d
je .Lavx2_shortcut
___
$code.=<<___ if ($avx);
-and \$`1<<30`,%r10d # mask "Intel CPU" bit
-and \$`1<<28|1<<9`,%r11d # mask AVX and SSSE3 bits
-or %r10d,%r11d
-cmp \$`1<<28|1<<9|1<<30`,%r11d
+and \$`1<<30`,%r9d # mask "Intel CPU" bit
+and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
+or %r9d,%r10d
+cmp \$`1<<28|1<<9|1<<30`,%r10d
je .Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
-test \$`1<<9`,%r11d
+test \$`1<<9`,%r10d
jnz .Lssse3_shortcut
___
$code.=<<___;
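Per its own comments, the dispatcher above now reads three dwords of OPENSSL_ia32cap_P: bit 11 of the second word flags XOP, bits 3/5/8 of the third word flag BMI1/AVX2/BMI2, bit 30 of the first word is the "Intel CPU" marker, and bits 9/28 of the second word are SSSE3/AVX. A small decoder for just those masks, with invented sample capability words:

#!/usr/bin/env perl
# Decode the OPENSSL_ia32cap_P bits the new dispatcher tests; bit meanings
# are taken from the comments above, the sample words are invented.
use strict; use warnings;

my @cap = (1<<30,                      # word 0: "Intel CPU" bit
           (1<<28) | (1<<9),           # word 1: AVX + SSSE3, no XOP (1<<11)
           (1<<8) | (1<<5) | (1<<3));  # word 2: BMI2 + AVX2 + BMI1

printf "XOP:             %s\n", ($cap[1] & (1<<11)) ? "yes" : "no";
printf "BMI1+AVX2+BMI2:  %s\n",
       (($cap[2] & (1<<8|1<<5|1<<3)) == (1<<8|1<<5|1<<3)) ? "yes" : "no";
printf "Intel+AVX+SSSE3: %s\n",
       ((($cap[0] & (1<<30)) | ($cap[1] & (1<<28|1<<9))) == (1<<28|1<<9|1<<30))
           ? "yes" : "no";

With these sample words the code above would skip the XOP shortcut and take .Lavx2_shortcut, before ever reaching the AVX, SSSE3 or scalar fallbacks.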
...
...
@@ -352,24 +374,43 @@ $code.=<<___;
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
...
...
@@ -379,48 +420,89 @@ $code.=<<___;
.type $TABLE,\@object
$TABLE:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.asciz "SHA512 block transfort for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}
...
...
@@ -489,7 +571,7 @@ my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3,$t4,$t5)=map("%xmm$_",(4..9));

$code.=<<___;
-.type ${func}_ssse3,\@function,4
+.type ${func}_ssse3,\@function,3
.align 64
${func}_ssse3:
.Lssse3_shortcut:
...
...
@@ -529,12 +611,12 @@ $code.=<<___;
___
$code.=<<___;
-movdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4
-movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5
+movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_ssse3
.align 16
.Lloop_ssse3:
-movdqa $TABLE+`$SZ*$rounds`(%rip),$t3
+movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
movdqu 0x00($inp),@X[0]
movdqu 0x10($inp),@X[1]
movdqu 0x20($inp),@X[2]
...
...
@@ -544,11 +626,11 @@ $code.=<<___;
pshufb $t3,@X[1]
movdqa 0x00($Tbl),$t0
pshufb $t3,@X[2]
-movdqa 0x10($Tbl),$t1
+movdqa 0x20($Tbl),$t1
paddd @X[0],$t0
-movdqa 0x20($Tbl),$t2
+movdqa 0x40($Tbl),$t2
pshufb $t3,@X[3]
-movdqa 0x30($Tbl),$t3
+movdqa 0x60($Tbl),$t3
paddd @X[1],$t1
paddd @X[2],$t2
paddd @X[3],$t3
...
...
@@ -564,7 +646,7 @@ $code.=<<___;
.align 16
.Lssse3_00_47:
-add \$16*$SZ,$Tbl
+sub \$-16*2*$SZ,$Tbl # size optimization
___
sub Xupdate_256_SSSE3 () {
	(
...
...
@@ -601,7 +683,7 @@ sub Xupdate_256_SSSE3 () {
	'&pxor ($t3,$t2);',
	'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
	'&pxor ($t3,$t2);',
-	'&movdqa ($t2,16*$j."($Tbl)")',
+	'&movdqa ($t2,16*2*$j."($Tbl)")',
	'&pshufb ($t3,$t5)',
	'&paddd (@X[0],$t3)'	# X[2..3] += sigma1(X[16..17])
	);
...
...
@@ -744,7 +826,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	&pxor ($t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
-	&movdqa ($t2,16*$j."($Tbl)");
+	&movdqa ($t2,16*2*$j."($Tbl)");
	 eval(shift(@insns));	#@
	 eval(shift(@insns));
	&pshufb ($t3,$t5);
...
...
@@ -767,7 +849,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	    &SSSE3_256_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
-	&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
+	&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne (".Lssse3_00_47");

	for ($i=0; $i<16; ) {
...
...
@@ -827,7 +909,7 @@ if ($avx) {{
#
if ($SZ==8) {	# SHA512 only
$code.=<<___;
-.type ${func}_xop,\@function,4
+.type ${func}_xop,\@function,3
.align 64
${func}_xop:
.Lxop_shortcut:
...
...
@@ -878,7 +960,7 @@ ___
$code.=<<___;
.align 16
.Lloop_xop:
-vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3
+vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
...
...
@@ -889,9 +971,9 @@ $code.=<<___;
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
-vpaddd 0x10($Tbl),@X[1],$t1
-vpaddd 0x20($Tbl),@X[2],$t2
-vpaddd 0x30($Tbl),@X[3],$t3
+vpaddd 0x20($Tbl),@X[1],$t1
+vpaddd 0x40($Tbl),@X[2],$t2
+vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
mov $A,$a1
vmovdqa $t1,0x10(%rsp)
...
...
@@ -904,7 +986,7 @@ $code.=<<___;
.align 16
.Lxop_00_47:
-add \$16*$SZ,$Tbl
+sub \$-16*2*$SZ,$Tbl # size optimization
___
sub XOP_256_00_47 () {
my $j = shift;
...
...
@@ -1001,7 +1083,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
-	&vpaddd ($t2,@X[0],16*$j."($Tbl)");
+	&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa (16*$j."(%rsp)",$t2);
}
...
...
@@ -1010,7 +1092,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	    &XOP_256_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
-	&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
+	&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne (".Lxop_00_47");

	for ($i=0; $i<16; ) {
...
...
@@ -1024,9 +1106,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
$code.=<<___;
.align 16
.Lloop_xop:
-vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3
+vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
-lea $TABLE(%rip),$Tbl
+lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
...
...
@@ -1040,20 +1122,20 @@ $code.=<<___;
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
-vpaddq 0x00($Tbl),@X[0],$t0
+vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
-vpaddq 0x10($Tbl),@X[1],$t1
+vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
-vpaddq 0x20($Tbl),@X[2],$t2
-vpaddq 0x30($Tbl),@X[3],$t3
+vpaddq -0x40($Tbl),@X[2],$t2
+vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
-vpaddq 0x40($Tbl),@X[4],$t0
+vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
-vpaddq 0x50($Tbl),@X[5],$t1
+vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
-vpaddq 0x60($Tbl),@X[6],$t2
+vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
-vpaddq 0x70($Tbl),@X[7],$t3
+vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
...
...
@@ -1066,7 +1148,7 @@ $code.=<<___;
.align 16
.Lxop_00_47:
-add \$16*$SZ,$Tbl
+add \$16*2*$SZ,$Tbl
___
sub XOP_512_00_47 () {
my $j = shift;
...
...
@@ -1129,7 +1211,7 @@ my @insns = (&$body,&$body); # 52 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
-	&vpaddq ($t2,@X[0],16*$j."($Tbl)");
+	&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa (16*$j."(%rsp)",$t2);
}
...
...
@@ -1138,7 +1220,7 @@ my @insns = (&$body,&$body); # 52 instructions
	    &XOP_512_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
-	&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
+	&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne (".Lxop_00_47");

	for ($i=0; $i<16; ) {
...
...
@@ -1203,7 +1285,7 @@ ___
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
-.type ${func}_avx,\@function,4
+.type ${func}_avx,\@function,3
.align 64
${func}_avx:
.Lavx_shortcut:
...
...
@@ -1251,12 +1333,12 @@ ___
my ($t0,$t1,$t2,$t3,$t4,$t5)=map("%xmm$_",(4..9));
$code.=<<___;
-vmovdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4
-vmovdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5
+vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
+vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_avx
.align 16
.Lloop_avx:
-vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3
+vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
...
...
@@ -1267,9 +1349,9 @@ $code.=<<___;
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
-vpaddd 0x10($Tbl),@X[1],$t1
-vpaddd 0x20($Tbl),@X[2],$t2
-vpaddd 0x30($Tbl),@X[3],$t3
+vpaddd 0x20($Tbl),@X[1],$t1
+vpaddd 0x40($Tbl),@X[2],$t2
+vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
mov $A,$a1
vmovdqa $t1,0x10(%rsp)
...
...
@@ -1282,7 +1364,7 @@ $code.=<<___;
.align 16
.Lavx_00_47:
-add \$16*$SZ,$Tbl
+sub \$-16*2*$SZ,$Tbl # size optimization
___
sub Xupdate_256_AVX () {
	(
...
...
@@ -1330,7 +1412,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	}
-	&vpaddd ($t2,@X[0],16*$j."($Tbl)");
+	&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa (16*$j."(%rsp)",$t2);
}
...
...
@@ -1339,7 +1421,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
	    &AVX_256_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
-	&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
+	&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne (".Lavx_00_47");

	for ($i=0; $i<16; ) {
...
...
@@ -1354,9 +1436,9 @@ $code.=<<___;
jmp .Lloop_avx
.align 16
.Lloop_avx:
-vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3
+vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
-lea $TABLE(%rip),$Tbl
+lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
...
...
@@ -1370,20 +1452,20 @@ $code.=<<___;
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
-vpaddq 0x00($Tbl),@X[0],$t0
+vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
-vpaddq 0x10($Tbl),@X[1],$t1
+vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
-vpaddq 0x20($Tbl),@X[2],$t2
-vpaddq 0x30($Tbl),@X[3],$t3
+vpaddq -0x40($Tbl),@X[2],$t2
+vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
-vpaddq 0x40($Tbl),@X[4],$t0
+vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
-vpaddq 0x50($Tbl),@X[5],$t1
+vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
-vpaddq 0x60($Tbl),@X[6],$t2
+vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
-vpaddq 0x70($Tbl),@X[7],$t3
+vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
...
...
@@ -1396,14 +1478,14 @@ $code.=<<___;
.align 16
.Lavx_00_47:
-add \$16*$SZ,$Tbl
+add \$16*2*$SZ,$Tbl
___
sub Xupdate_512_AVX () {
	(
	'&vpalignr ($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr ($t3,@X[5],@X[4],$SZ)',	# X[9..10]
-	'&vpsrlq ($t2,$t0,$sigma0[0]);',
-	'&vpaddq (@X[0],@X[0],$t3)',	# X[0..1] += X[9..10]
+	'&vpsrlq ($t2,$t0,$sigma0[0])',
+	'&vpaddq (@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq ($t3,$t0,$sigma0[2])',
	'&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor ($t0,$t3,$t2)',
...
...
@@ -1413,7 +1495,7 @@ sub Xupdate_512_AVX () {
	'&vpxor ($t0,$t0,$t2)',
	'&vpsrlq ($t3,@X[7],$sigma1[2]);',
	'&vpxor ($t0,$t0,$t1)',		# sigma0(X[1..2])
-	'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1])',
+	'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq (@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq ($t1,@X[7],$sigma1[0]);',
	'&vpxor ($t3,$t3,$t2)',
...
...
@@ -1437,7 +1519,7 @@ my @insns = (&$body,&$body); # 52 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	}
-	&vpaddq ($t2,@X[0],16*$j."($Tbl)");
+	&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa (16*$j."(%rsp)",$t2);
}
...
...
@@ -1446,7 +1528,7 @@ my @insns = (&$body,&$body); # 52 instructions
	    &AVX_512_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
-	&cmpb ($SZ-1+16*$SZ."($Tbl)",0);
+	&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne (".Lavx_00_47");

	for ($i=0; $i<16; ) {
...
...
@@ -1504,6 +1586,389 @@ $code.=<<___;
ret
.size ${func}_avx,.-${func}_avx
___
if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and ($a4,$e)',		# f&e
	'&rorx ($a0,$e,$Sigma1[2])',
	'&rorx ($a2,$e,$Sigma1[1])',
	'&lea ($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea ($h,"($h,$a4)")',
	'&andn ($a4,$e,$g)',		# ~e&g
	'&xor ($a0,$a2)',
	'&rorx ($a1,$e,$Sigma1[0])',
	'&lea ($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor ($a0,$a1)',		# Sigma1(e)
	'&mov ($a2,$a)',
	'&rorx ($a4,$a,$Sigma0[2])',
	'&lea ($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor ($a2,$b)',		# a^b, b^c in next round
	'&rorx ($a1,$a,$Sigma0[1])',
	'&rorx ($a0,$a,$Sigma0[0])',
	'&lea ($d,"($d,$h)")',		# d+=h
	'&and ($a3,$a2)',		# (b^c)&(a^b)
	'&xor ($a1,$a4)',
	'&xor ($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor ($a1,$a0)',		# Sigma0(a)
	'&lea ($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov ($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}
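bodyx_00_15 above is still the standard SHA-2 round, just re-ordered around the BMI2 rorx/andn forms it uses and with two values carried across rounds, as its comments note: Sigma0(a) is folded in one round late ("h+=Sigma0(a) from the past") and Maj(a,b,c) is built from the cached $a3 = $b^$c. For reference, a plain-Perl model of the SHA-256 round being interleaved (helper names are illustrative, not part of the generator):

#!/usr/bin/env perl
# Straightforward SHA-256 round, the logical operation that bodyx_00_15
# schedules across BMI2 instructions.  Helper names are illustrative.
use strict; use warnings;

sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub round {
    my ($w_plus_k, @S) = @_;                 # pre-added X[i]+K[i], then a..h
    my ($a,$b,$c,$d,$e,$f,$g,$h) = @S;
    my $Sigma1 = rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25);
    my $Sigma0 = rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22);
    my $Ch  = ($e & $f) ^ (~$e & $g);
    my $Maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);
    my $T1  = ($h + $Sigma1 + $Ch + $w_plus_k) & 0xffffffff;
    my $T2  = ($Sigma0 + $Maj) & 0xffffffff;
    # rotate the state: a..h -> (T1+T2), a, b, c, d+T1, e, f, g
    return (($T1 + $T2) & 0xffffffff, $a, $b, $c,
            ($d + $T1) & 0xffffffff, $e, $f, $g);
}

The X[i]+K[i] sums are exactly what the vpaddd/vpaddq results parked on the stack hold, which is why the first &add in the list above indexes (%rsp) through $base instead of recomputing them.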
$code.=<<___;
.type ${func}_avx2,\@function,3
.align 64
${func}_avx2:
.Lavx2_shortcut:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
vzeroall
sub \$-16*$SZ,$inp # inp++, size optimization
mov $SZ*0($ctx),$A
xor %r12,%r12 # borrow $T1
mov $SZ*1($ctx),$B
cmp %rdx,$inp # $_end
mov $SZ*2($ctx),$C
sete %r12b
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Loop_avx2
.align 16
.Loop_avx2:
shl \$`log(16*$SZ)/log(2)`,%r12
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
neg %r12
vmovdqu -16*$SZ+0($inp),$t0
add $inp,%r12
vmovdqu -16*$SZ+32($inp),$t1
vmovdqu (%r12),@X[2] # next or same input block
vmovdqu 32(%r12),@X[3]
vperm2i128 \$0x20,@X[2],$t0,@X[0]
#mov $inp,$_inp # offload $inp
vperm2i128 \$0x31,@X[2],$t0,@X[1]
vperm2i128 \$0x20,@X[3],$t1,@X[2]
vperm2i128 \$0x31,@X[3],$t1,@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[0],@X[0]
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
lea -$PUSH8(%rsp),%rsp
mov $B,$a3
vmovdqa $t2,0x00(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x20(%rsp)
mov $F,$a4
sub \$-16*2*$SZ,$Tbl # size optimization
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea ("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {	# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
	for ($i=0,$j=0; $j<4; $j++) {
	    &AVX2_256_00_47($j,\&bodyx_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
	&lea ($Tbl,16*2*$SZ."($Tbl)");
	&cmpb (($SZ-1)."($Tbl)",0);
	&jne (".Lavx2_00_47");

	for ($i=0; $i<16; ) {
	    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	    foreach(bodyx_00_15()) { eval; }
	}
} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
jmp .Loop_avx2
.align 16
.Loop_avx2:
shl \$`log(16*$SZ)/log(2)`,%r12
vmovdqu -16*$SZ($inp),$t0
neg %r12
vmovdqu -16*$SZ+32($inp),$t1
add $inp,%r12
vmovdqu -16*$SZ+64($inp),$t2
vmovdqu -16*$SZ+96($inp),$t3
vmovdqu (%r12),@X[4] # next or same block
vmovdqu 32(%r12),@X[5]
vmovdqu 64(%r12),@X[6]
vmovdqu 96(%r12),@X[7]
vperm2i128 \$0x20,@X[4],$t0,@X[0]
#mov $inp,$_inp # offload $inp
vperm2i128 \$0x31,@X[4],$t0,@X[1]
vperm2i128 \$0x20,@X[5],$t1,@X[2]
vperm2i128 \$0x31,@X[5],$t1,@X[3]
vperm2i128 \$0x20,@X[6],$t2,@X[4]
vperm2i128 \$0x31,@X[6],$t2,@X[5]
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t2
vperm2i128 \$0x20,@X[7],$t3,@X[6]
vperm2i128 \$0x31,@X[7],$t3,@X[7]
vpshufb $t2,@X[0],@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vpshufb $t2,@X[1],@X[1]
vpshufb $t2,@X[2],@X[2]
vpshufb $t2,@X[3],@X[3]
vpshufb $t2,@X[4],@X[4]
vpshufb $t2,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t2,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t2,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x20(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x40(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x60(%rsp)
lea -$PUSH8(%rsp),%rsp
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
mov $B,$a3
vmovdqa $t2,0x40(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x60(%rsp)
mov $F,$a4
add \$16*2*$SZ,$Tbl
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);	# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea ("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {	# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
	for ($i=0,$j=0; $j<8; $j++) {
	    &AVX2_512_00_47($j,\&bodyx_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
	&lea ($Tbl,16*2*$SZ."($Tbl)");
	&cmpb (($SZ-1-0x80)."($Tbl)",0);
	&jne (".Lavx2_00_47");

	for ($i=0; $i<16; ) {
	    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	    foreach(bodyx_00_15()) { eval; }
	}
}
$code.=<<___;
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
cmp `$PUSH8+2*8`($Tbl),$inp # $_end
je .Ldone_avx2
xor $a1,$a1
mov $B,$a3
xor $C,$a3 # magic
mov $F,$a4
jmp .Lower_avx2
.align 16
.Lower_avx2:
___
for ($i=0; $i<8; ) {
    my $base="+16($Tbl)";
    foreach(bodyx_00_15()) { eval; }
}
$code.=<<___;
lea -$PUSH8($Tbl),$Tbl
cmp %rsp,$Tbl
jae .Lower_avx2
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
lea `2*16*$SZ`($inp),$inp # inp+=2
add $SZ*6($ctx),$G
xor %r12,%r12
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
sete %r12b
jbe .Loop_avx2
lea (%rsp),$Tbl
.Ldone_avx2:
lea ($Tbl),%rsp
mov $_rsp,%rsi
vzeroall
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
___
}}
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
...
...
@@ -1547,7 +2012,17 @@ se_handler:
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
___
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
jb .Lnot_in_avx2
and \$-256*$SZ,%rax
add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax
...
...
@@ -1635,6 +2110,11 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
$code.=<<___;
.section .xdata
.align 8
...
...
@@ -1661,6 +2141,12 @@ $code.=<<___ if ($avx);
.rva se_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
...
...