OpenHarmony / Third Party Openssl
Commit 32213d8d
Authored Jun 10, 2013 by Andy Polyakov

sha[256|512]-586.pl: add more SIMD code paths.

Parent: b4275915
2 changed files with 617 additions and 127 deletions (+617 −127)

crypto/sha/asm/sha256-586.pl  +344 −103
crypto/sha/asm/sha512-586.pl  +273 −24
crypto/sha/asm/sha256-586.pl
@@ -28,18 +28,31 @@
 # May version, >60% over original. Add AVX+shrd code path, >25%
 # improvement on Sandy Bridge over May version, 60% over original.
 #
+# May 2013.
+#
+# Replace AMD XOP code path with SSSE3 to cover more processors.
+# (Biggest improvement coefficient is on upcoming Atom Silvermont,
+# not shown.) Add AVX+BMI code path.
+#
 # Performance in clock cycles per processed byte (less is better):
 #
-#		PIII	P4	AMD K8	Core2	SB	Atom	Bldzr
-# gcc		36	41	27	26	25	50	36
-# icc		33	38	25	23	-	-	-
-# x86 asm(*)	27/24	28	19/15.5	18/15.6	12.3	30/25	16.6
-# x86_64 asm(**)	17.5	15.1	13.9	11.6	22	13.7
+#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
+# Pentium	46	57	40/38		-	-
+# PIII		36	33	27/24		-	-
+# P4		41	38	28		-	17.3
+# AMD K8	27	25	19/15.5		-	14.9
+# Core2		26	23	18/15.6		14.3	13.8
+# Westmere	27	-	19/15.7		13.4	12.3
+# Sandy Bridge	25	-	15.9		12.4	11.6
+# Ivy Bridge	24	-	15.0		11.4	10.3
+# Haswell	22	-	13.9		9.46	7.80
+# Bulldozer	36	-	27/22		17.0	13.6
+# VIA Nano	36	-	25/22		16.8	16.5
+# Atom		50	-	30/25		21.9	18.9
 #
-# (*)	numbers after slash are for unrolled loop, where available,
-#	otherwise best applicable such as AVX/XOP;
+# (*)	numbers after slash are for unrolled loop, where applicable;
 # (**)	x86_64 assembly performance is presented for reference
-#	purposes.
+#	purposes, results are best-available;

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@@ -47,21 +60,23 @@ require "x86asm.pl";
 &asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");

-$xmm=$ymm=0;
+$xmm=$avx=0;
 for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

-$ymm=1 if ($xmm &&
-	`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-	$1>=2.19);	# first version supporting AVX
+if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}

-$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
-	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-	$1>=2.03);	# first version supporting AVX
+if ($xmm && !$avx && $ARGV[0] eq "win32n" &&
+	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.03) + ($1>=2.10);
+}

-$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
-	`ml 2>&1` =~ /Version ([0-9]+)\./ &&
-	$1>=10);	# first version supporting AVX
+if ($xmm && !$avx && $ARGV[0] eq "win32" &&
+	`ml 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}

 $unroll_after=64*4;	# If pre-evicted from L1P cache first spin of
			# fully unrolled loop was measured to run about
@@ -178,17 +193,21 @@ sub BODY_00_15() {
 if (!$i386) {
	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
	&mov	("ecx",&DWP(0,"edx"));
-	&mov	("edx",&DWP(4,"edx"));
+	&mov	("ebx",&DWP(4,"edx"));
	&test	("ecx",1<<20);			# check for P4
	&jnz	(&label("loop"));
-	&test	("edx",1<<11);			# check for XOP
-	&jnz	(&label("XOP"))			if ($ymm);
	&and	("ecx",1<<30);			# mask "Intel CPU" bit
-	&and	("edx",1<<28);			# mask AVX bit
-	&or	("ecx","edx");
+	&and	("ebx",1<<28|1<<9);		# mask AVX and SSSE3 bits
+	&or	("ecx","ebx");
+	&and	("ecx",1<<28|1<<30);
	&cmp	("ecx",1<<28|1<<30);
-	&je	(&label("AVX"))			if ($ymm);
-	&je	(&label("loop_shrd"))		if (!$ymm);
+	if ($xmm) {
+		&je	(&label("AVX"))		if ($avx);
+		&test	("ebx",1<<9);		# check for SSSE3
+		&jnz	(&label("SSSE3"));
+	} else {
+		&je	(&label("loop_shrd"));
+	}
 if ($unroll_after) {
	&sub	("eax","edi");
	&cmp	("eax",$unroll_after);
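The dispatch above keys off two OPENSSL_ia32cap_P words. A minimal C sketch of the same predicate, assuming the bit positions named in the comments (P4 at bit 20 of word 0, "Intel CPU" at bit 30, AVX at bit 28 and SSSE3 at bit 9 of word 1); pick_path and the PATH_* labels are illustrative, not part of the perlasm:

#include <stdint.h>

/* cap0/cap1 stand in for the two 32-bit words the perlasm loads from
 * OPENSSL_ia32cap_P; the PATH_* names are made up for this sketch. */
enum { PATH_LOOP, PATH_LOOP_SHRD, PATH_SSSE3, PATH_AVX };

static int pick_path(uint32_t cap0, uint32_t cap1, int xmm_built, int avx_built)
{
    if (cap0 & (1u << 20))                      /* P4 bit: stay on compact loop */
        return PATH_LOOP;
    uint32_t c = (cap0 & (1u << 30))            /* "Intel CPU" bit */
               | (cap1 & ((1u << 28) | (1u << 9)));  /* AVX and SSSE3 bits */
    int intel_avx = (c & ((1u << 28) | (1u << 30))) == ((1u << 28) | (1u << 30));

    if (xmm_built) {
        if (avx_built && intel_avx)
            return PATH_AVX;                    /* Intel with AVX */
        if (c & (1u << 9))
            return PATH_SSSE3;
        return PATH_LOOP;                       /* integer fallback */
    }
    return intel_avx ? PATH_LOOP_SHRD : PATH_LOOP;  /* shrd variant helps SB */
}

int main(void) { return pick_path(1u << 30, (1u << 28) | (1u << 9), 1, 1) != PATH_AVX; }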
@@ -199,7 +218,7 @@ sub BODY_00_15() {
 sub COMPACT_LOOP() {
 my $suffix=shift;

-&set_label("loop$suffix",16);
+&set_label("loop$suffix",$suffix?32:16);
	# copy input block to stack reversing byte and dword order
	for($i=0;$i<4;$i++) {
		&mov	("eax",&DWP($i*16+0,"edi"));
@@ -292,7 +311,7 @@ my $suffix=shift;
 &COMPACT_LOOP();
 &mov	("esp",&DWP(12,"esp"));		# restore sp
 &function_end_A();

-if (!$i386 && !$ymm) {
+if (!$i386 && !$xmm) {
	# ~20% improvement on Sandy Bridge
	local *ror = sub { &shrd(@_[0],@_) };
	&COMPACT_LOOP("_shrd");
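The local *ror override above makes the loop body emit shrd where it would emit ror: double-shifting a register against itself produces the same 32-bit rotation but issues faster on Sandy Bridge (the "~20% improvement" noted in the comment). A small self-checking C sketch of the identity; the function names are illustrative:

#include <stdint.h>
#include <assert.h>

/* ror(x,n) and shrd(x,x,n) agree: shifting the 64-bit concatenation x:x
 * right by n keeps exactly the wrapped-around bits. n must be 1..31. */
static uint32_t ror32(uint32_t x, unsigned n) {
    return (x >> n) | (x << (32 - n));
}
static uint32_t shrd32(uint32_t hi, uint32_t lo, unsigned n) {
    return (uint32_t)((((uint64_t)hi << 32) | lo) >> n);
}

int main(void) {
    assert(ror32(0x12345678u, 7) == shrd32(0x12345678u, 0x12345678u, 7));
    return 0;
}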
@@ -318,7 +337,11 @@ my $suffix=shift;
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
 &data_word(@K256);
-&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
 &asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
+sub off { &DWP(4*(((shift)-$i)&7),"esp"); }

 if (!$i386 && $unroll_after) {
 my @AH=($A,$K256);
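The &data_word mask above, read as bytes in little-endian memory, is 03 02 01 00  07 06 05 04  0b 0a 09 08  0f 0e 0d 0c, so pshufb with it byte-reverses each 32-bit lane of the big-endian message words. A minimal C model of that shuffle; pshufb16 is an illustrative scalar stand-in for the instruction:

#include <stdint.h>
#include <assert.h>

/* dest[i] = src[mask[i] & 15], or 0 when the mask byte's high bit is set,
 * which is pshufb's defined behaviour. */
static void pshufb16(uint8_t r[16], const uint8_t x[16], const uint8_t m[16]) {
    for (int i = 0; i < 16; i++)
        r[i] = (m[i] & 0x80) ? 0 : x[m[i] & 0x0f];
}

int main(void) {
    const uint8_t m[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)i;
    pshufb16(out, in, m);
    assert(out[0] == 3 && out[4] == 7);   /* each dword byte-reversed */
    return 0;
}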
@@ -365,8 +388,6 @@ my @AH=($A,$K256);
	&mov	(&DWP(32+12*$i,"esp"),"ebx");
 my ($t1,$t2)=("ecx","esi");

-($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
-sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
-
 for ($i=0;$i<64;$i++) {
@@ -472,15 +493,14 @@ my @AH=($A,$K256);
 &mov	("esp",&DWP(96+12,"esp"));	# restore sp
 &function_end_A();

-if ($ymm) {{{
+}
+if (!$i386 && $xmm) {{{
 my @X = map("xmm$_",(0..3));
 my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
 my @AH = ($A,$T);

-&set_label("XOP",16);
+&set_label("SSSE3",32);
	&lea	("esp",&DWP(-96,"esp"));
-	&vzeroall();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
@@ -500,135 +520,208 @@ my @AH = ($A,$T);
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
-	&vmovdqa	($t3,&QWP(256,$K256));
-	&jmp	(&label("grand_xop"));
+	&movdqa	($t3,&QWP(256,$K256));
+	&jmp	(&label("grand_ssse3"));

-&set_label("grand_xop",16);
+&set_label("grand_ssse3",16);
	# load input, reverse byte order, add K256[0..15], save to stack
-	&vmovdqu	(@X[0],&QWP(0,"edi"));
-	&vmovdqu	(@X[1],&QWP(16,"edi"));
-	&vmovdqu	(@X[2],&QWP(32,"edi"));
-	&vmovdqu	(@X[3],&QWP(48,"edi"));
-	&add	("edi",64);
-	&vpshufb	(@X[0],@X[0],$t3);
-	&mov	(&DWP(96+4,"esp"),"edi");
-	&vpshufb	(@X[1],@X[1],$t3);
-	&vpshufb	(@X[2],@X[2],$t3);
-	&vpaddd	($t0,@X[0],&QWP(0,$K256));
-	&vpshufb	(@X[3],@X[3],$t3);
-	&vpaddd	($t1,@X[1],&QWP(16,$K256));
-	&vpaddd	($t2,@X[2],&QWP(32,$K256));
-	&vpaddd	($t3,@X[3],&QWP(48,$K256));
-	&vmovdqa	(&QWP(32+0,"esp"),$t0);
-	&vmovdqa	(&QWP(32+16,"esp"),$t1);
-	&vmovdqa	(&QWP(32+32,"esp"),$t2);
-	&vmovdqa	(&QWP(32+48,"esp"),$t3);
-	&jmp	(&label("xop_00_47"));
-
-&set_label("xop_00_47",16);
+	&movdqu	(@X[0],&QWP(0,"edi"));
+	&movdqu	(@X[1],&QWP(16,"edi"));
+	&movdqu	(@X[2],&QWP(32,"edi"));
+	&movdqu	(@X[3],&QWP(48,"edi"));
+	&add	("edi",64);
+	&pshufb	(@X[0],$t3);
+	&mov	(&DWP(96+4,"esp"),"edi");
+	&pshufb	(@X[1],$t3);
+	&movdqa	($t0,&QWP(0,$K256));
+	&pshufb	(@X[2],$t3);
+	&movdqa	($t1,&QWP(16,$K256));
+	&paddd	($t0,@X[0]);
+	&pshufb	(@X[3],$t3);
+	&movdqa	($t2,&QWP(32,$K256));
+	&paddd	($t1,@X[1]);
+	&movdqa	($t3,&QWP(48,$K256));
+	&movdqa	(&QWP(32+0,"esp"),$t0);
+	&paddd	($t2,@X[2]);
+	&movdqa	(&QWP(32+16,"esp"),$t1);
+	&paddd	($t3,@X[3]);
+	&movdqa	(&QWP(32+32,"esp"),$t2);
+	&movdqa	(&QWP(32+48,"esp"),$t3);
+	&jmp	(&label("ssse3_00_47"));
+
+&set_label("ssse3_00_47",16);
	&add	($K256,64);

-sub XOP_00_47 () {
+sub SSSE3_00_47 () {
 my $j = shift;
 my $body = shift;
 my @X = @_;
 my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions

-	&vpalignr	($t0,@X[1],@X[0],4);	# X[1..4]
	  eval(shift(@insns));
+	&movdqa	($t0,@X[1]);
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpalignr	($t3,@X[3],@X[2],4);	# X[9..12]
+	&movdqa	($t3,@X[3]);
	  eval(shift(@insns));
	  eval(shift(@insns));
-	&vprotd	($t1,$t0,14);
+	&palignr($t0,@X[0],4);			# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpsrld	($t0,$t0,3);
-	&vpaddd	(@X[0],@X[0],$t3);		# X[0..3] += X[9..12]
+	&palignr($t3,@X[2],4);			# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&movdqa	($t1,$t0);
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vprotd	($t2,$t1,25-14);
-	&vpxor	($t0,$t0,$t1);
+	&movdqa	($t2,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&psrld	($t0,3);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&paddd	(@X[0],$t3);			# X[0..3] += X[9..12]
	  eval(shift(@insns));
-	&vprotd	($t3,@X[3],13);
-	&vpxor	($t0,$t0,$t2);			# sigma0(X[1..4])
	  eval(shift(@insns));
+	&psrld	($t2,7);
	  eval(shift(@insns));
-	&vpsrld	($t2,@X[3],10);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpaddd	(@X[0],@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
+	&pshufd	($t3,@X[3],0b11111010);		# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));
-	&vprotd	($t1,$t3,15-13);
-	&vpxor	($t3,$t3,$t2);
+	&pslld	($t1,32-18);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&psrld	($t2,18-7);
	  eval(shift(@insns));
-	&vpxor	($t3,$t3,$t1);			# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&pslld	($t1,18-7);
	  eval(shift(@insns));
-	&vpsrldq($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&movdqa	($t2,$t3);
	  eval(shift(@insns));
-	&vpaddd	(@X[0],@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t0,$t1);			# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&psrld	($t3,10);
	  eval(shift(@insns));
-	&vprotd	($t3,@X[0],13);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&paddd	(@X[0],$t0);			# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
-	&vpsrld	($t2,@X[0],10);
	  eval(shift(@insns));
+	&psrlq	($t2,17);
	  eval(shift(@insns));
-	&vprotd	($t1,$t3,15-13);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t3,$t2);
	  eval(shift(@insns));
-	&vpxor	($t3,$t3,$t2);
	  eval(shift(@insns));
+	&psrlq	($t2,19-17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t3,$t2);
	  eval(shift(@insns));
-	&vpxor	($t3,$t3,$t1);			# sigma1(X[16..17])
	  eval(shift(@insns));
+	&pshufd	($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpslldq($t3,$t3,8);			# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpaddd	(@X[0],@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
+	&psrldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&paddd	(@X[0],$t3);			# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
-	&vpaddd	($t2,@X[0],&QWP(16*$j,$K256));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
+	&pshufd	($t3,@X[0],0b01010000);		# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&movdqa	($t2,$t3);
	  eval(shift(@insns));			# @
+	&psrld	($t3,10);
	  eval(shift(@insns));
+	&psrlq	($t2,17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&psrlq	($t2,19-17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&pxor	($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&pshufd	($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&movdqa	($t2,&QWP(16*$j,$K256));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&pslldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
+	&paddd	(@X[0],$t3);			# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
+	&paddd	($t2,@X[0]);
	  eval(shift(@insns));			# @

	foreach (@insns) { eval; }		# remaining instructions

-	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
+	&movdqa	(&QWP(32+16*$j,"esp"),$t2);
 }

 sub body_00_15 () {
	(
	'&mov	("ecx",$E);',
-	'&mov	("esi",&off($f));',
	'&ror	($E,25-11);',
-	'&mov	("edi",&off($g));',
+	'&mov	("esi",&off($f));',
	'&xor	($E,"ecx");',
+	'&mov	("edi",&off($g));',
	'&xor	("esi","edi");',
	'&ror	($E,11-6);',
	'&and	("esi","ecx");',
@@ -637,19 +730,19 @@ sub body_00_15 () {
	'&xor	("edi","esi");',	# Ch(e,f,g)
	'&ror	($E,6);',		# T = Sigma1(e)
	'&mov	("ecx",$AH[0]);',
-	'&add	($E,"edi");',		# T += Ch(e,f,g)
-	'&mov	("edi",&off($b));',
	'&mov	("esi",$AH[0]);',
-	'&add	($E,&off($h));',	# T += h
	'&ror	("ecx",22-13);',
+	'&add	($E,"edi");',		# T += Ch(e,f,g)
+	'&mov	("edi",&off($b));',
-	'&xor	("ecx",$AH[0]);',
	'&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
+	'&xor	("ecx",$AH[0]);',
	'&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
+	'&add	($E,&off($h));',	# T += h
	'&ror	("ecx",13-2);',
	'&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
-	'&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
	'&xor	("ecx","esi");',
+	'&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
	'&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
	'&ror	("ecx",2);',		# Sigma0(a)
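The round body above leans on the identity named in its comments, Maj(a,b,c) = Ch(a^b,c,b): with b^c carried over from the previous round, Maj costs one and plus one xor. A self-checking C sketch of that identity; the inputs are arbitrary test values:

#include <stdint.h>
#include <assert.h>

static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }

int main(void) {
    uint32_t a = 0x6a09e667, b = 0xbb67ae85, c = 0x3c6ef372;
    /* identity used above: select c where a and b differ, else b */
    assert(Maj(a, b, c) == Ch(a ^ b, c, b));
    /* the scheduled form: (b^c) & (a^b), then ^ b */
    assert(Maj(a, b, c) == (((b ^ c) & (a ^ b)) ^ b));
    return 0;
}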
@@ -662,11 +755,11 @@ sub body_00_15 () {
 }

 for ($i=0,$j=0;$j<4;$j++) {
-	&XOP_00_47($j,\&body_00_15,@X);
+	&SSSE3_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
 }
	&cmp	(&DWP(16*$j,$K256),0x00010203);
-	&jne	(&label("xop_00_47"));
+	&jne	(&label("ssse3_00_47"));

 for ($i=0;$i<16;) {
	foreach(body_00_15()) { eval; }
 }
@@ -708,16 +801,21 @@ sub body_00_15 () {
	&mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
-	&vmovdqa	($t3,&QWP(64,$K256));
+	&movdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
-	&jb	(&label("grand_xop"));
+	&jb	(&label("grand_ssse3"));
	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
-	&vzeroall();
 &function_end_A();

-&set_label("AVX",16);
+if ($avx) {
+&set_label("AVX",32);
+	if ($avx>1) {
+		&mov	("edx",&DWP(8,"edx"));
+		&and	("edx",1<<8|1<<3);	# check for BMI2+BMI1
+		&cmp	("edx",1<<8|1<<3);
+		&je	(&label("AVX_BMI"));
+	}
	&lea	("esp",&DWP(-96,"esp"));
	&vzeroall();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
@@ -742,7 +840,7 @@ sub body_00_15 () {
	&vmovdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_avx"));

-&set_label("grand_avx",16);
+&set_label("grand_avx",32);
	# load input, reverse byte order, add K256[0..15], save to stack
	&vmovdqu	(@X[0],&QWP(0,"edi"));
	&vmovdqu	(@X[1],&QWP(16,"edi"));
@@ -809,12 +907,14 @@ my $j = shift;
 my $body = shift;
 my @X = @_;
 my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
+my $insn;

	foreach (Xupdate_AVX()) {		# 31 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
-	    eval(shift(@insns));
+	    eval($insn = shift(@insns));
+	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
	}
	&vpaddd	($t2,@X[0],&QWP(16*$j,$K256));
	foreach (@insns) { eval; }		# remaining instructions
@@ -876,9 +976,150 @@ my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
	&vzeroall();
 &function_end_A();

+if ($avx>1) {
+sub bodyx_00_15 () {			# +10%
+	(
+	'&rorx	("ecx",$E,6)',
+	'&rorx	("esi",$E,11)',
+	'&mov	(&off($e),$E)',		# save $E, modulo-scheduled
+	'&rorx	("edi",$E,25)',
+	'&xor	("ecx","esi")',
+	'&andn	("esi",$E,&off($g))',
+	'&xor	("ecx","edi")',		# Sigma1(e)
+	'&and	($E,&off($f))',
+	'&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
+	'&or	($E,"esi")',		# T = Ch(e,f,g)
+
+	'&rorx	("edi",$AH[0],2)',
+	'&rorx	("esi",$AH[0],13)',
+	'&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
+	'&rorx	("ecx",$AH[0],22)',
+	'&xor	("esi","edi")',
+	'&mov	("edi",&off($b))',
+	'&xor	("ecx","esi")',		# Sigma0(a)
+
+	'&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
+	'&add	($E,&off($h))',		# T += h
+	'&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
+	'&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
+	'&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)
+
+	'&add	("ecx",$E)',		# h += T
+	'&add	($E,&off($d))',		# d += T
+	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)
+	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
+	);
+}
+
+&set_label("AVX_BMI",32);
+	&lea	("esp",&DWP(-96,"esp"));
+	&vzeroall();
+	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
+	&mov	($AH[0],&DWP(0,"esi"));
+	&mov	($AH[1],&DWP(4,"esi"));
+	&mov	("ecx",&DWP(8,"esi"));
+	&mov	("edi",&DWP(12,"esi"));
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"ecx");			# magic
+	&mov	(&DWP(8,"esp"),"ecx");
+	&mov	(&DWP(12,"esp"),"edi");
+	&mov	($E,&DWP(16,"esi"));
+	&mov	("edi",&DWP(20,"esi"));
+	&mov	("ecx",&DWP(24,"esi"));
+	&mov	("esi",&DWP(28,"esi"));
+	#&mov	(&DWP(16,"esp"),$E);
+	&mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esp"),"esi");
+	&vmovdqa	($t3,&QWP(256,$K256));
+	&jmp	(&label("grand_avx_bmi"));
+
+&set_label("grand_avx_bmi",32);
+	# load input, reverse byte order, add K256[0..15], save to stack
+	&vmovdqu	(@X[0],&QWP(0,"edi"));
+	&vmovdqu	(@X[1],&QWP(16,"edi"));
+	&vmovdqu	(@X[2],&QWP(32,"edi"));
+	&vmovdqu	(@X[3],&QWP(48,"edi"));
+	&add	("edi",64);
+	&vpshufb	(@X[0],@X[0],$t3);
+	&mov	(&DWP(96+4,"esp"),"edi");
+	&vpshufb	(@X[1],@X[1],$t3);
+	&vpshufb	(@X[2],@X[2],$t3);
+	&vpaddd	($t0,@X[0],&QWP(0,$K256));
+	&vpshufb	(@X[3],@X[3],$t3);
+	&vpaddd	($t1,@X[1],&QWP(16,$K256));
+	&vpaddd	($t2,@X[2],&QWP(32,$K256));
+	&vpaddd	($t3,@X[3],&QWP(48,$K256));
+	&vmovdqa	(&QWP(32+0,"esp"),$t0);
+	&vmovdqa	(&QWP(32+16,"esp"),$t1);
+	&vmovdqa	(&QWP(32+32,"esp"),$t2);
+	&vmovdqa	(&QWP(32+48,"esp"),$t3);
+	&jmp	(&label("avx_bmi_00_47"));
+
+&set_label("avx_bmi_00_47",16);
+	&add	($K256,64);
+
+for ($i=0,$j=0; $j<4; $j++) {
+	&AVX_00_47($j,\&bodyx_00_15,@X);
+	push(@X,shift(@X));		# rotate(@X)
+}
+	&cmp	(&DWP(16*$j,$K256),0x00010203);
+	&jne	(&label("avx_bmi_00_47"));
+
+for ($i=0; $i<16; ) {
+	foreach(bodyx_00_15()) { eval; }
+}
+
+	&mov	("esi",&DWP(96,"esp"));	#ctx
+					#&mov	($AH[0],&DWP(0,"esp"));
+	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
+					#&mov	("edi", &DWP(8,"esp"));
+	&mov	("ecx",&DWP(12,"esp"));
+	&add	($AH[0],&DWP(0,"esi"));
+	&add	($AH[1],&DWP(4,"esi"));
+	&add	("edi",&DWP(8,"esi"));
+	&add	("ecx",&DWP(12,"esi"));
+	&mov	(&DWP(0,"esi"),$AH[0]);
+	&mov	(&DWP(4,"esi"),$AH[1]);
+	&mov	(&DWP(8,"esi"),"edi");
+	&mov	(&DWP(12,"esi"),"ecx");
+	#&mov	(&DWP(0,"esp"),$AH[0]);
+	&mov	(&DWP(4,"esp"),$AH[1]);
+	&xor	($AH[1],"edi");			# magic
+	&mov	(&DWP(8,"esp"),"edi");
+	&mov	(&DWP(12,"esp"),"ecx");
+	#&mov	($E,&DWP(16,"esp"));
+	&mov	("edi",&DWP(20,"esp"));
+	&mov	("ecx",&DWP(24,"esp"));
+	&add	($E,&DWP(16,"esi"));
+	&add	("edi",&DWP(20,"esi"));
+	&add	("ecx",&DWP(24,"esi"));
+	&mov	(&DWP(16,"esi"),$E);
+	&mov	(&DWP(20,"esi"),"edi");
+	&mov	(&DWP(20,"esp"),"edi");
+	&mov	("edi",&DWP(28,"esp"));
+	&mov	(&DWP(24,"esi"),"ecx");
+	#&mov	(&DWP(16,"esp"),$E);
+	&add	("edi",&DWP(28,"esi"));
+	&mov	(&DWP(24,"esp"),"ecx");
+	&mov	(&DWP(28,"esi"),"edi");
+	&mov	(&DWP(28,"esp"),"edi");
+	&mov	("edi",&DWP(96+4,"esp"));	# inp
+
+	&vmovdqa	($t3,&QWP(64,$K256));
+	&sub	($K256,3*64);			# rewind K
+	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
+	&jb	(&label("grand_avx_bmi"));
+
+	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
+	&vzeroall();
+&function_end_A();
+}
+}
 }}}
 &function_end_B("sha256_block_data_order");
 &asciz	("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

 &asm_finish();
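bodyx_00_15 above replaces the classic Ch sequence with BMI1/BMI2 forms: andn yields ~e&g in one instruction, and rorx rotates without touching flags, which is what lets the three rotations of Sigma1 interleave freely. A hedged C sketch of those two building blocks; the helper names are illustrative:

#include <stdint.h>
#include <assert.h>

/* rorx-style rotate: same result as ror, but the hardware form sets no flags */
static uint32_t rorx32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* Ch(e,f,g) = (e & f) | (~e & g): one andn, one and, one or.
 * The two terms are bit-disjoint, so | and ^ are interchangeable here. */
static uint32_t Ch_bmi(uint32_t e, uint32_t f, uint32_t g) {
    uint32_t t = ~e & g;        /* andn */
    return (e & f) | t;         /* and + or */
}

/* SHA-256 Sigma1, with the rotation counts used by the rorx lines above */
static uint32_t Sigma1(uint32_t e) {
    return rorx32(e, 6) ^ rorx32(e, 11) ^ rorx32(e, 25);
}

int main(void) {
    uint32_t e = 0x510e527f, f = 0x9b05688c, g = 0x1f83d9ab;
    assert(Ch_bmi(e, f, g) == ((e & f) ^ (~e & g)));
    assert(Sigma1(0) == 0);
    return 0;
}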
crypto/sha/asm/sha512-586.pl
@@ -9,18 +9,30 @@
 #
 # SHA512 block transform for x86. September 2007.
 #
+# May 2013.
+#
+# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
+#
 # Performance in clock cycles per processed byte (less is better):
 #
-#		PIII	P4	AMD K8	Core2	SB	Atom	Bldzr
-# gcc		75	116	54	66	58	126	121
-# icc		77	95	55	57	-	-	-
-# x86 asm	56	82	36	40	35	68	50
-# SSE2 asm	-	36.2	20.8	19.2	14.9	60(**)	17.1
-# x86_64 asm(*)	-	33	9.6	10.3	11.3	14.7	13.5
+#		gcc	icc	x86 asm	SIMD(*)	x86_64(**)
+# Pentium	100	97	61	-	-
+# PIII		75	77	56	-	-
+# P4		116	95	82	34.6	30.8
+# AMD K8	54	55	36	20.7	9.57
+# Core2		66	57	40	15.9	9.97
+# Westmere	70	-	38	12.2	9.58
+# Sandy Bridge	58	-	35	11.9	11.2
+# Ivy Bridge	50	-	33	11.5	8.17
+# Haswell	46	-	29	11.3	7.66
+# Bulldozer	121	-	50	14.0	13.5
+# VIA Nano	91	-	52	33	14.7
+# Atom		126	-	68	48(***)	14.7
 #
-# (*)	x86_64 assembler performance is presented for reference
-#	purposes.
-# (**)	paddq is increadibly slow on Atom.
+# (*)	whichever best applicable.
+# (**)	x86_64 assembler performance is presented for reference
+#	purposes, the results are for integer-only code.
+# (***)	paddq is increadibly slow on Atom.
 #
 # IALU code-path is optimized for elder Pentiums. On vanilla Pentium
 # performance improvement over compiler generated code reaches ~60%,
@@ -82,7 +94,7 @@ sub BODY_00_15_sse2 {
	&pand	("mm5",$E);			# f&=e
	&psllq	($E,23);			# $E is sliding left
	&movq	($A,"mm3")			if ($phase<2);
-	&movq	(&QWP(8*9,"esp"),"mm7");	# save X[i]
+	&movq	(&QWP(8*9,"esp"),"mm7")		if ($phase>1);	# save X[i]
	&movq	("mm3","mm1");			# %mm3 is T1
	&psrlq	("mm1",4);
	&pxor	("mm5","mm6");			# Ch(e,f,g)
@@ -100,11 +112,11 @@ sub BODY_00_15_sse2 {
	&pxor	("mm3",$E);			# T1=Sigma1_512(e)
	&movq	($E,$Dsse2);			# e = load d, e in next round
-	&movq	("mm5",$A);			# %mm5 is sliding right
	&paddq	("mm3","mm7");			# T1+=X[i]
+	&movq	("mm5",$A);			# %mm5 is sliding right
	&psrlq	("mm5",28);
-	&movq	("mm6",$A);			# %mm6 is sliding left
	&paddq	($E,"mm3");			# d += T1
+	&movq	("mm6",$A);			# %mm6 is sliding left
	&movq	("mm7","mm5");
	&psllq	("mm6",25);
	&movq	("mm1",$Bsse2);			# load b
@@ -290,19 +302,28 @@ sub BODY_00_15_x86 {
 if ($sse2) {
	&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
-	&bt	(&DWP(0,"edx"),26);
-	&jnc	(&label("loop_x86"));
+	&mov	("ecx",&DWP(0,"edx"));
+	&test	("ecx",1<<26);
+	&jz	(&label("loop_x86"));
+	&mov	("edx",&DWP(4,"edx"));

	# load ctx->h[0-7]
	&movq	($A,&QWP(0,"esi"));
+	&and	("ecx",1<<24);		# XMM registers availability
	&movq	("mm1",&QWP(8,"esi"));
+	&and	("edx",1<<9);		# SSSE3 bit
	&movq	($BxC,&QWP(16,"esi"));
+	&or	("ecx","edx");
	&movq	("mm3",&QWP(24,"esi"));
	&movq	($E,&QWP(32,"esi"));
	&movq	("mm5",&QWP(40,"esi"));
	&movq	("mm6",&QWP(48,"esi"));
	&movq	("mm7",&QWP(56,"esi"));
+	&cmp	("ecx",1<<24|1<<9);
+	&je	(&label("SSSE3"));
	&sub	("esp",8*10);
+	&jmp	(&label("loop_sse2"));

&set_label("loop_sse2",16);
	#&movq	($Asse2,$A);
@@ -316,32 +337,32 @@ if ($sse2) {
	&movq	($Hsse2,"mm7");
	&movq	("mm3",$A);			# magic
-	&mov	("ecx",&DWP(0,"edi"));
-	&mov	("edx",&DWP(4,"edi"));
+	&mov	("eax",&DWP(0,"edi"));
+	&mov	("ebx",&DWP(4,"edi"));
	&add	("edi",8);
-	&bswap	("ecx");
-	&bswap	("edx");
-	&mov	(&DWP(8*9+4,"esp"),"ecx");
-	&mov	(&DWP(8*9+0,"esp"),"edx");
	&mov	("edx",15);			# counter
+	&bswap	("eax");
+	&bswap	("ebx");
+	&jmp	(&label("00_14_sse2"));

&set_label("00_14_sse2",16);
+	&movd	("mm1","eax");
	&mov	("eax",&DWP(0,"edi"));
+	&movd	("mm7","ebx");
	&mov	("ebx",&DWP(4,"edi"));
	&add	("edi",8);
	&bswap	("eax");
-	&movq	("mm7",&QWP(8*9,"esp"));	# X[i]
	&bswap	("ebx");
	&mov	(&DWP(8*8+4,"esp"),"eax");
	&mov	(&DWP(8*8+0,"esp"),"ebx");
+	&punpckldq("mm7","mm1");
	&BODY_00_15_sse2();
	&dec	("edx");
	&jnz	(&label("00_14_sse2"));

-	&movq	("mm7",&QWP(8*9,"esp"));	# X[i]
+	&movd	("mm1","eax");
+	&movd	("mm7","ebx");
+	&punpckldq("mm7","mm1");
	&BODY_00_15_sse2(1);
@@ -429,6 +450,231 @@ if ($sse2) {
	&mov	("esp",&DWP(8*10+12,"esp"));	# restore sp
	&emms	();
 &function_end_A();

+&set_label("SSSE3",32);
+{ my ($cnt,$frame)=("ecx","edx");
+  my @X=map("xmm$_",(0..7));
+  my $j;
+  my $i=0;
+
+	&lea	($frame,&DWP(-64,"esp"));
+	&sub	("esp",256);
+
+	# fixed stack frame layout
+	#
+	# +0	A B C D E F G H		# backing store
+	# +64	X[0]+K[i] .. X[15]+K[i]	# XMM->MM xfer area
+	# +192				# XMM off-load ring buffer
+	# +256				# saved parameters
+
+	&movdqa		(@X[1],&QWP(80*8,$K512));	# byte swap mask
+	&movdqu		(@X[0],&QWP(0,"edi"));
+	&pshufb		(@X[0],@X[1]);
+	for ($j=0;$j<8;$j++) {
+		&movdqa	(&QWP(16*(($j-1)%4),$frame),@X[3])	if ($j>4);	# off-load
+		&movdqa	(@X[3],&QWP(16*($j%8),$K512));
+		&movdqa	(@X[2],@X[1])				if ($j<7);	# perpetuate byte swap mask
+		&movdqu	(@X[1],&QWP(16*($j+1),"edi"))		if ($j<7);	# next input
+		&movdqa	(@X[1],&QWP(16*(($j+1)%4),$frame))	if ($j==7);	# restore @X[0]
+		&paddq	(@X[3],@X[0]);
+		&pshufb	(@X[1],@X[2])				if ($j<7);
+		&movdqa	(&QWP(16*($j%8)-128,$frame),@X[3]);	# xfer X[i]+K[i]
+
+		push(@X,shift(@X));	# rotate(@X)
+	}
+	#&jmp	(&label("loop_ssse3"));
+	&nop	();
+
+&set_label("loop_ssse3",32);
+	&movdqa	(@X[2],&QWP(16*(($j+1)%4),$frame));	# pre-restore @X[1]
+	&movdqa	(&QWP(16*(($j-1)%4),$frame),@X[3]);	# off-load @X[3]
+	&lea	($K512,&DWP(16*8,$K512));
+
+	#&movq	($Asse2,$A);			# off-load A-H
+	&movq	($Bsse2,"mm1");
+	&mov	("ebx","edi");
+	&movq	($Csse2,$BxC);
+	&lea	("edi",&DWP(128,"edi"));	# advance input
+	&movq	($Dsse2,"mm3");
+	&cmp	("edi","eax");
+	#&movq	($Esse2,$E);
+	&movq	($Fsse2,"mm5");
+	&cmovb	("ebx","edi");
+	&movq	($Gsse2,"mm6");
+	&mov	("ecx",4);			# loop counter
+	&pxor	($BxC,"mm1");			# magic
+	&movq	($Hsse2,"mm7");
+	&pxor	("mm3","mm3");			# magic
+
+	&jmp	(&label("00_47_ssse3"));
+
+sub BODY_00_15_ssse3 {		# "phase-less" copy of BODY_00_15_sse2
+	(
+	'&movq	("mm1",$E)',				# %mm1 is sliding right
+	'&movq	("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
+	'&pxor	("mm5","mm6")',				# f^=g
+	'&psrlq	("mm1",14)',
+	'&movq	(&QWP(8*($i+4)%64,"esp"),$E)',		# modulo-scheduled save e
+	'&pand	("mm5",$E)',				# f&=e
+	'&psllq	($E,23)',				# $E is sliding left
+	'&paddq	($A,"mm3")',				# [h+=Maj(a,b,c)]
+	'&movq	("mm3","mm1")',				# %mm3 is T1
+	'&psrlq("mm1",4)',
+	'&pxor	("mm5","mm6")',				# Ch(e,f,g)
+	'&pxor	("mm3",$E)',
+	'&psllq($E,23)',
+	'&pxor	("mm3","mm1")',
+	'&movq	(&QWP(8*$i%64,"esp"),$A)',		# modulo-scheduled save a
+	'&paddq("mm7","mm5")',				# X[i]+=Ch(e,f,g)
+	'&pxor	("mm3",$E)',
+	'&psrlq("mm1",23)',
+	'&paddq("mm7",&QWP(8*($i+7)%64,"esp"))',	# X[i]+=h
+	'&pxor	("mm3","mm1")',
+	'&psllq($E,4)',
+	'&pxor	("mm3",$E)',				# T1=Sigma1_512(e)
+	'&movq	($E,&DWP(8*($i+3)%64,"esp"))',		# e = load d, e in next round
+	'&paddq	("mm3","mm7")',				# T1+=X[i]
+	'&movq	("mm5",$A)',				# %mm5 is sliding right
+	'&psrlq("mm5",28)',
+	'&paddq	($E,"mm3")',				# d += T1
+	'&movq	("mm6",$A)',				# %mm6 is sliding left
+	'&movq	("mm7","mm5")',
+	'&psllq("mm6",25)',
+	'&movq	("mm1",&QWP(8*($i+1)%64,"esp"))',	# load b
+	'&psrlq("mm5",6)',
+	'&pxor	("mm7","mm6")',
+	'&psllq("mm6",5)',
+	'&pxor	("mm7","mm5")',
+	'&pxor	($A,"mm1")',				# a^b, b^c in next round
+	'&psrlq("mm5",5)',
+	'&pxor	("mm7","mm6")',
+	'&pand	($BxC,$A)',				# (b^c)&(a^b)
+	'&psllq("mm6",6)',
+	'&pxor	("mm7","mm5")',
+	'&pxor	($BxC,"mm1")',				# [h=]Maj(a,b,c)
+	'&pxor	("mm6","mm7")',				# Sigma0_512(a)
+	'&movq	("mm5",&QWP(8*($i+5-1)%64,"esp"))',	# pre-load f
+	'&paddq	($BxC,"mm6")',				# h+=Sigma0(a)
+	'&movq	("mm6",&QWP(8*($i+6-1)%64,"esp"))',	# pre-load g
+	'($A,$BxC) = ($BxC,$A); $i--;'
+	);
+}
+
+&set_label("00_47_ssse3",32);
+
+for (;$j<16;$j++) {
+	my ($t0,$t2,$t1)=@X[2..4];
+	my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+	&movdqa		($t2,@X[5]);
+	&movdqa		(@X[1],$t0);			# restore @X[1]
+	&palignr	($t0,@X[0],8);			# X[1..2]
+	&movdqa		(&QWP(16*($j%4),$frame),@X[4]);	# off-load @X[4]
+	&palignr	($t2,@X[4],8);			# X[9..10]
+
+	&movdqa		($t1,$t0);
+	&psrlq		($t0,7);
+	&paddq		(@X[0],$t2);			# X[0..1] += X[9..10]
+	&movdqa		($t2,$t1);
+	&psrlq		($t1,1);
+	&psllq		($t2,64-8);
+	&pxor		($t0,$t1);
+	&psrlq		($t1,8-1);
+	&pxor		($t0,$t2);
+	&psllq		($t2,8-1);
+	&pxor		($t0,$t1);
+	&movdqa		($t1,@X[7]);
+	&pxor		($t0,$t2);			# sigma0(X[1..2])
+	&movdqa		($t2,@X[7]);
+	&psrlq		($t1,6);
+	&paddq		(@X[0],$t0);			# X[0..1] += sigma0(X[1..2])
+
+	&movdqa		($t0,@X[7]);
+	&psrlq		($t2,19);
+	&psllq		($t0,64-61);
+	&pxor		($t1,$t2);
+	&psrlq		($t2,61-19);
+	&pxor		($t1,$t0);
+	&psllq		($t0,61-19);
+	&pxor		($t1,$t2);
+	&movdqa		($t2,&QWP(16*(($j+2)%4),$frame));	# pre-restore @X[1]
+	&pxor		($t1,$t0);			# sigma0(X[1..2])
+	&movdqa		($t0,&QWP(16*($j%8),$K512));
+
+	  eval(shift(@insns));
+	&paddq		(@X[0],$t1);			# X[0..1] += sigma0(X[14..15])
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	  eval(shift(@insns));
+	&paddq		($t0,@X[0]);
+	foreach (@insns) { eval; }
+	&movdqa		(&QWP(16*($j%8)-128,$frame),$t0);	# xfer X[i]+K[i]
+
+	push(@X,shift(@X));	# rotate(@X)
+}
+	&lea	($K512,&DWP(16*8,$K512));
+	&dec	("ecx");
+	&jnz	(&label("00_47_ssse3"));
+
+	&movdqa	(@X[1],&QWP(0,$K512));		# byte swap mask
+	&lea	($K512,&DWP(-80*8,$K512));	# rewind
+	&movdqu	(@X[0],&QWP(0,"ebx"));
+	&pshufb	(@X[0],@X[1]);
+
+	for ($j=0;$j<8;$j++) {		# load next or same block
+		my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
+
+		&movdqa	(&QWP(16*(($j-1)%4),$frame),@X[3])	if ($j>4);	# off-load
+		&movdqa	(@X[3],&QWP(16*($j%8),$K512));
+		&movdqa	(@X[2],@X[1])				if ($j<7);	# perpetuate byte swap mask
+		&movdqu	(@X[1],&QWP(16*($j+1),"ebx"))		if ($j<7);	# next input
+		&movdqa	(@X[1],&QWP(16*(($j+1)%4),$frame))	if ($j==7);	# restore @X[0]
+		&paddq	(@X[3],@X[0]);
+		&pshufb	(@X[1],@X[2])				if ($j<7);
+
+		foreach (@insns) { eval; }
+		&movdqa	(&QWP(16*($j%8)-128,$frame),@X[3]);	# xfer X[i]+K[i]
+
+		push(@X,shift(@X));	# rotate(@X)
+	}
+
+	#&movq	($A,$Asse2);			# load A-H
+	&movq	("mm1",$Bsse2);
+	&paddq	($A,"mm3");			# from BODY_00_15
+	#&movq	($BxC,$Csse2);
+	&movq	("mm3",$Dsse2);
+	#&movq	($E,$Esse2);
+	#&movq	("mm5",$Fsse2);
+	#&movq	("mm6",$Gsse2);
+	&movq	("mm7",$Hsse2);
+	&pxor	($BxC,"mm1");			# de-magic
+	&paddq	($A,&QWP(0,"esi"));
+	&paddq	("mm1",&QWP(8,"esi"));
+	&paddq	($BxC,&QWP(16,"esi"));
+	&paddq	("mm3",&QWP(24,"esi"));
+	&paddq	($E,&QWP(32,"esi"));
+	&paddq	("mm5",&QWP(40,"esi"));
+	&paddq	("mm6",&QWP(48,"esi"));
+	&paddq	("mm7",&QWP(56,"esi"));
+	&movq	(&QWP(0,"esi"),$A);
+	&movq	(&QWP(8,"esi"),"mm1");
+	&movq	(&QWP(16,"esi"),$BxC);
+	&movq	(&QWP(24,"esi"),"mm3");
+	&movq	(&QWP(32,"esi"),$E);
+	&movq	(&QWP(40,"esi"),"mm5");
+	&movq	(&QWP(48,"esi"),"mm6");
+	&movq	(&QWP(56,"esi"),"mm7");
+	&cmp	("edi","eax");			# are we done yet?
+	&jb	(&label("loop_ssse3"));
+
+	&mov	("esp",&DWP(64+12,$frame));	# restore sp
+	&emms	();
+}
+&function_end_A();
 }
 &set_label("loop_x86",16);
	# copy input block to stack reversing byte and qword order
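The psrlq/psllq pairs in the SSSE3 schedule above synthesize the 64-bit rotates of SHA-512's sigma functions, since SSE2/SSSE3 have no vector rotate. A minimal C cross-check of sigma0 against its shift-only form (sigma0/sigma1 as defined in FIPS 180-4; the function names here are illustrative):

#include <stdint.h>
#include <assert.h>

static uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static uint64_t sigma0(uint64_t x) { return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

/* shift-only form, mirroring the psrlq/psllq sequence in the diff */
static uint64_t sigma0_shifts(uint64_t x) {
    return ((x >> 1) | (x << 63)) ^ ((x >> 8) | (x << 56)) ^ (x >> 7);
}

int main(void) {
    uint64_t x = 0x428a2f98d728ae22ull;   /* arbitrary test value */
    assert(sigma0(x) == sigma0_shifts(x));
    assert(sigma1(x) == (((x >> 19) | (x << 45)) ^ ((x >> 61) | (x << 3)) ^ (x >> 6)));
    return 0;
}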
@@ -655,6 +901,9 @@ if ($sse2) {
	&data_word(0xfc657e2a,0x597f299c);	# u64
	&data_word(0x3ad6faec,0x5fcb6fab);	# u64
	&data_word(0x4a475817,0x6c44198c);	# u64
+
+	&data_word(0x04050607,0x00010203);	# byte swap
+	&data_word(0x0c0d0e0f,0x08090a0b);	# mask
 &function_end_B("sha512_block_data_order");
 &asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");