Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
btwise
openssl
提交
3a9b3852
O
openssl
项目概览
btwise
/
openssl
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
3a9b3852
编写于
6月 12, 2012
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
sha256-586.pl: squeeze some more, most notably ~10% on Nehalem.
上级
d2e18031
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
71 addition
and
62 deletion
+71
-62
crypto/sha/asm/sha256-586.pl
crypto/sha/asm/sha256-586.pl
+67
-58
crypto/sha/asm/sha512-x86_64.pl
crypto/sha/asm/sha512-x86_64.pl
+4
-4
未找到文件。
crypto/sha/asm/sha256-586.pl
浏览文件 @
3a9b3852
...
...
@@ -17,18 +17,18 @@
#
# Optimization including two of Pavel Semjanov's ideas, alternative
# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
# ~10% on Pentium and P4, ~37% on Atom. As fully unrolled loop body is
# almost 15x larger, 8KB vs. 560B, it's fired only for longer inputs.
# But not on P4, where it kills performance, nor Sandy Bridge, where
# folded loop is just as fast...
# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost
# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not
# on P4, where it kills performance, nor Sandy Bridge, where folded
# loop is approximately as fast...
#
# Performance in clock cycles per processed byte (less is better):
#
#               Pentium PIII    P4      AMD K8  Core2   SB(**)  Atom
# gcc           46      36      41      27      26      25      50
# icc           57      33      38      25      23      -       -
# x86 asm(*)    39/36   27/24   30      19/15.5 18/16   16(**)  30/26
# x86_64 asm(***)       -       17.5    15      16      17.5    23
#               PIII    P4      AMD K8  Core2   SB(**)  Atom    Bldzr
# gcc           36      41      27      26      25      50      36
# icc           33      38      25      23      -       -       -
# x86 asm(*)    27/24   28      19/15.5 18/15.6 16(**)  30/25   27/22
# x86_64 asm(***)       17.5    15      15.5    17.5    23      21
#
# (*) numbers after slash are for unrolled loop, where available;
# (**) for Sandy Bridge executing code path with ror replaced with
...
...
@@ -42,7 +42,13 @@ require "x86asm.pl";
&asm_init
(
$ARGV
[
0
],"
sha512-586.pl
",
$ARGV
[
$#ARGV
]
eq
"
386
");
$unroll_after
=
1024
;
$unroll_after
=
64
*
4
;
# If pre-evicted from L1P cache first spin of
# fully unrolled loop was measured to run about
# 3-4x slower. If slowdown coefficient is N and
# unrolled loop is m times faster, then you break
# even at (N-1)/(m-1) blocks. Then it needs to be
# adjusted for probability of code being evicted,
# code size/cache size=1/4. Typical m is 1.15...
$A
=
"
eax
";
$E
=
"
edx
";
...
...
@@ -65,9 +71,9 @@ sub BODY_16_63() {
&mov
("
edi
","
esi
");
&ror
("
esi
",
19
-
17
);
&xor
("
ecx
",
$T
);
&shr
(
$T
,
3
);
&xor
("
esi
","
edi
");
&shr
(
$T
,
3
);
&ror
("
ecx
",
7
);
&xor
("
esi
","
edi
");
&xor
(
$T
,"
ecx
");
# T = sigma0(X[-15])
&ror
("
esi
",
17
);
&add
(
$T
,
&DWP
(
4
*
(
9
+
15
+
16
),"
esp
"));
# T += X[-16]
...
...
@@ -96,8 +102,8 @@ sub BODY_00_15() {
&and
("
esi
",
$E
);
&mov
(
$Eoff
,
$E
);
# modulo-scheduled
&xor
(
$E
,"
ecx
");
&xor
("
esi
","
edi
");
# Ch(e,f,g)
&add
(
$T
,
$Hoff
);
# T += h
&xor
("
esi
","
edi
");
# Ch(e,f,g)
&ror
(
$E
,
6
);
# Sigma1(e)
&mov
("
ecx
",
$A
);
&add
(
$T
,"
esi
");
# T += Ch(e,f,g)
...
...
@@ -162,7 +168,7 @@ sub BODY_00_15() {
if
(
$unroll_after
)
{
&sub
("
eax
","
edi
");
&cmp
("
eax
",
$unroll_after
);
&j
g
e
(
&label
("
unrolled
"));
&j
a
e
(
&label
("
unrolled
"));
}
}
&jmp
(
&label
("
loop
"));
...
...
@@ -288,6 +294,7 @@ my $suffix=shift;
0x748f82ee
,
0x78a5636f
,
0x84c87814
,
0x8cc70208
,
0x90befffa
,
0xa4506ceb
,
0xbef9a3f7
,
0xc67178f2
);
&data_word
(
@K256
);
&data_word
(
0x00010203
,
0x04050607
,
0x08090a0b
,
0x0c0d0e0f
);
if
(
!
$i386
&&
$unroll_after
)
{
my
@AH
=
(
$A
,
$K256
);
...
...
@@ -333,70 +340,72 @@ my @AH=($A,$K256);
&mov
(
&DWP
(
96
+
4
,"
esp
"),"
edi
");
&mov
(
&DWP
(
32
+
12
*$i
,"
esp
"),"
ebx
");
my
(
$t1
,
$t2
)
=
("
ecx
","
esi
");
my
(
$a
,
$b
,
$c
,
$d
,
$e
,
$f
,
$g
,
$h
)
=
(
0
..
7
);
# offsets
sub
off
{
&DWP
(
4
*
(((
shift
)
-
$i
)
&
7
),"
esp
");
}
for
(
$i
=
0
;
$i
<
64
;
$i
++
)
{
if
(
$i
>=
16
)
{
&mov
(
$T
,
"
ecx
");
# "ecx"
is preloaded
# &mov (
"esi"
,&DWP(32+4*(($i+14)&15),"esp"));
&ror
(
"
ecx
"
,
18
-
7
);
&mov
("
edi
",
"
esi
"
);
&ror
(
"
esi
"
,
19
-
17
);
&xor
(
"
ecx
"
,
$T
);
&shr
(
$T
,
3
);
&xor
("
esi
","
edi
"
);
&ror
("
ecx
",
7
);
&xor
(
$T
,
"
ecx
"
);
# T = sigma0(X[-15])
&ror
(
"
esi
"
,
17
);
&mov
(
$T
,
$t1
);
# $t1
is preloaded
# &mov (
$t2
,&DWP(32+4*(($i+14)&15),"esp"));
&ror
(
$t1
,
18
-
7
);
&mov
("
edi
",
$t2
);
&ror
(
$t2
,
19
-
17
);
&xor
(
$t1
,
$T
);
&shr
(
$T
,
3
);
&ror
(
$t1
,
7
);
&xor
(
$t2
,"
edi
"
);
&xor
(
$T
,
$t1
);
# T = sigma0(X[-15])
&ror
(
$t2
,
17
);
&add
(
$T
,
&DWP
(
32
+
4
*
(
$i
&
15
),"
esp
"));
# T += X[-16]
&shr
("
edi
",
10
);
&add
(
$T
,
&DWP
(
32
+
4
*
((
$i
+
9
)
&
15
),"
esp
"));
# T += X[-7]
#&xor ("edi",
"esi"
) # sigma1(X[-2])
#&xor ("edi",
$t2
) # sigma1(X[-2])
# &add ($T,"edi"); # T += sigma1(X[-2])
# &mov (&DWP(4*(9+15),"esp"),$T); # save X[0]
}
&mov
(
"
ecx
"
,
$E
);
&xor
("
edi
",
"
esi
"
)
if
(
$i
>=
16
);
# sigma1(X[-2])
&mov
(
"
esi
"
,
&off
(
$f
));
&ror
(
"
ecx
"
,
25
-
11
);
&mov
(
$t1
,
$E
);
&xor
("
edi
",
$t2
)
if
(
$i
>=
16
);
# sigma1(X[-2])
&mov
(
$t2
,
&off
(
$f
));
&ror
(
$E
,
25
-
11
);
&add
(
$T
,"
edi
")
if
(
$i
>=
16
);
# T += sigma1(X[-2])
&mov
("
edi
",
&off
(
$g
));
&xor
("
ecx
",
$E
);
&xor
("
esi
","
edi
");
&xor
(
$E
,
$t1
);
&mov
(
$T
,
&DWP
(
32
+
4
*
(
$i
&
15
),"
esp
"))
if
(
$i
<
16
);
# X[i]
&mov
(
&DWP
(
32
+
4
*
(
$i
&
15
),"
esp
"),
$T
)
if
(
$i
>=
16
);
# save X[0]
&ror
("
ecx
",
11
-
6
);
&and
("
esi
",
$E
);
&
mov
(
&off
(
$e
),
$E
);
# modulo-scheduled
&xor
(
$E
,"
ecx
");
&xor
("
esi
","
edi
");
# Ch(e,f,g)
&mov
(
&DWP
(
32
+
4
*
(
$i
&
15
),"
esp
"),
$T
)
if
(
$i
>=
16
&&
$i
<
62
);
# save X[0]
&xor
(
$t2
,"
edi
"
);
&ror
(
$E
,
11
-
6
);
&
and
(
$t2
,
$t1
);
&mov
(
&off
(
$e
),
$t1
);
# save $E, modulo-scheduled
&xor
(
$E
,
$t1
);
&add
(
$T
,
&off
(
$h
));
# T += h
&xor
("
edi
",
$t2
);
# Ch(e,f,g)
&ror
(
$E
,
6
);
# Sigma1(e)
&mov
(
"
ecx
"
,
$AH
[
0
]);
&add
(
$T
,"
e
s
i
");
# T += Ch(e,f,g)
&mov
(
$t1
,
$AH
[
0
]);
&add
(
$T
,"
e
d
i
");
# T += Ch(e,f,g)
&ror
("
ecx
",
22
-
13
);
&ror
(
$t1
,
22
-
13
);
&mov
(
$t2
,
$AH
[
0
]);
&mov
("
edi
",
&off
(
$b
));
&xor
("
ecx
",
$AH
[
0
]);
&mov
(
&off
(
$a
),
$AH
[
0
]);
# modulo-scheduled
&ror
("
ecx
",
13
-
2
);
&lea
(
$T
,
&DWP
(
@K256
[
$i
],
$T
,
$E
));
# T += Sigma1(e)+K[i]
&mov
(
$E
,
&off
(
$d
));
# e in next iteration, d in this one
&xor
("
ecx
",
$AH
[
0
]);
&xor
(
$t1
,
$AH
[
0
]);
&mov
(
&off
(
$a
),
$AH
[
0
]);
# save $A, modulo-scheduled
&xor
(
$AH
[
0
],"
edi
");
# a ^= b, (b^c) in next round
&ror
("
ecx
",
2
);
# Sigma0(a)
&add
(
$E
,
$T
);
# d += T
&and
(
$AH
[
1
],
$AH
[
0
]);
# a &= (b^c)
&add
(
$T
,"
ecx
");
# T += Sigma0(a)
&mov
("
ecx
",
&DWP
(
32
+
4
*
((
$i
+
2
)
&
15
),"
esp
"))
if
(
$i
>=
15
&&
$i
<
63
);
&xor
(
$AH
[
1
],"
edi
");
# h = Maj(a,b,c) = Ch(a^b,c,b)
&mov
("
esi
",
&DWP
(
32
+
4
*
((
$i
+
15
)
&
15
),"
esp
"))
if
(
$i
>=
15
&&
$i
<
63
);
&add
(
$AH
[
1
],
$T
);
# h += T
unshift
(
@AH
,
pop
(
@AH
));
# rotate(a,h)
&ror
(
$t1
,
13
-
2
);
&and
(
$AH
[
1
],
$AH
[
0
]);
# (b^c) &= (a^b)
&lea
(
$E
,
&DWP
(
@K256
[
$i
],
$T
,
$E
));
# T += Sigma1(e)+K[i]
&xor
(
$t1
,
$t2
);
&xor
(
$AH
[
1
],"
edi
");
# h = Maj(a,b,c) = Ch(a^b,c,b)
&mov
(
$t2
,
&DWP
(
32
+
4
*
((
$i
+
2
)
&
15
),"
esp
"))
if
(
$i
>=
15
&&
$i
<
63
);
&ror
(
$t1
,
2
);
# Sigma0(a)
&add
(
$AH
[
1
],
$E
);
# h += T
&add
(
$E
,
&off
(
$d
));
# d += T
&add
(
$AH
[
1
],
$t1
);
# h += Sigma0(a)
&mov
(
$t1
,
&DWP
(
32
+
4
*
((
$i
+
15
)
&
15
),"
esp
"))
if
(
$i
>=
15
&&
$i
<
63
);
@AH
=
reverse
(
@AH
);
# rotate(a,h)
(
$t1
,
$t2
)
=
(
$t2
,
$t1
);
# rotate(t1,t2)
}
&mov
("
esi
",
&DWP
(
96
,"
esp
"));
#ctx
#&mov ($AH[0],&DWP(0,"esp"));
...
...
crypto/sha/asm/sha512-x86_64.pl
浏览文件 @
3a9b3852
...
...
@@ -106,20 +106,20 @@ $code.=<<___;
xor $e,$a0
xor $g,$a2 # f^g
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
mov $T1,`$SZ*($i&0xf)`(%rsp)
xor $a,$a1
and $e,$a2 # (f^g)&e
mov $T1,`$SZ*($i&0xf)`(%rsp)
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
xor $e,$a0
add $a2,$T1 # T1+=Ch(e,f,g)
add ($Tbl),$T1 # T1+=K[round]
mov $a,$a2
add ($Tbl),$T1 # T1+=K[round]
xor $a,$a1
ror \$$Sigma1[0],$a0 # Sigma1(e)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录