Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
4cfe3df1
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4cfe3df1
编写于
12月 28, 2006
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Minor performance improvements to x86-mont.pl.
上级
8f2d60ec
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
43 addition
and
26 deletion
+43
-26
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86-mont.pl
+43
-26
未找到文件。
crypto/bn/asm/x86-mont.pl
浏览文件 @
4cfe3df1
...
...
@@ -24,7 +24,7 @@
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives
>=3
0% on rsa512 sign benchmark...
# gives
~4
0% on rsa512 sign benchmark...
$
0
=~
m/(.*[\/\\])[^\/\\]+$/
;
$dir
=
$
1
;
push
(
@INC
,"
${dir}
","
${dir}
../../perlasm
");
...
...
@@ -46,12 +46,12 @@ $rp="edi"; $bp="edi"; # overlapping variables!!!
$np
=
"
ebp
";
$num
=
"
ebx
";
$_
rp
=
&DWP
(
4
*
0
,"
esp
");
# stack top layout
$_
a
p
=
&DWP
(
4
*
1
,"
esp
");
$_
b
p
=
&DWP
(
4
*
2
,"
esp
");
$_
n
p
=
&DWP
(
4
*
3
,"
esp
");
$_n
0
=
&DWP
(
4
*
4
,"
esp
");
$_n
um
=
&DWP
(
4
*
5
,"
esp
");
$_
num
=
&DWP
(
4
*
0
,"
esp
");
# stack top layout
$_
r
p
=
&DWP
(
4
*
1
,"
esp
");
$_
a
p
=
&DWP
(
4
*
2
,"
esp
");
$_
b
p
=
&DWP
(
4
*
3
,"
esp
");
$_n
p
=
&DWP
(
4
*
4
,"
esp
");
$_n
0
=
&DWP
(
4
*
5
,"
esp
");
$_sp
=
&DWP
(
4
*
6
,"
esp
");
$_bpend
=
&DWP
(
4
*
7
,"
esp
");
$frame
=
32
;
# size of above frame rounded up to 16n
...
...
@@ -61,20 +61,37 @@ $frame=32; # size of above frame rounded up to 16n
&cmp
("
edi
",
4
);
&jl
(
&label
("
just_leave
"));
################################# load argument block...
&mov
("
eax
",
&wparam
(
0
));
# BN_ULONG *rp
&mov
("
ebx
",
&wparam
(
1
));
# const BN_ULONG *ap
&mov
("
ecx
",
&wparam
(
2
));
# const BN_ULONG *bp
&mov
("
edx
",
&wparam
(
3
));
# const BN_ULONG *np
&mov
("
esi
",
&wparam
(
4
));
# const BN_ULONG *n0
#&mov ("edi",&wparam(5)); # int num
&lea
("
esi
",
&wparam
(
0
));
# put aside pointer to argument block
&lea
("
edx
",
&wparam
(
1
));
# load ap
&mov
("
ebp
","
esp
");
# saved stack pointer!
&add
("
edi
",
2
);
# extra two words on top of tp
&neg
("
edi
");
&lea
("
esp
",
&DWP
(
-
$frame
,"
esp
","
edi
",
4
));
# alloca($frame+4*(num+2))
&neg
("
edi
");
&and
("
esp
",
-
4096
);
# minimize TLB utilization
# minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov
("
eax
","
esp
");
&sub
("
eax
","
edx
");
&and
("
eax
",
2047
);
&sub
("
esp
","
eax
");
# this aligns sp and ap modulo 2048
&xor
("
edx
","
esp
");
&and
("
edx
",
2048
);
&xor
("
edx
",
2048
);
&sub
("
esp
","
edx
");
# this splits them apart modulo 4096
&and
("
esp
",
-
64
);
# align to cache line
################################# load argument block...
&mov
("
eax
",
&DWP
(
0
*
4
,"
esi
"));
# BN_ULONG *rp
&mov
("
ebx
",
&DWP
(
1
*
4
,"
esi
"));
# const BN_ULONG *ap
&mov
("
ecx
",
&DWP
(
2
*
4
,"
esi
"));
# const BN_ULONG *bp
&mov
("
edx
",
&DWP
(
3
*
4
,"
esi
"));
# const BN_ULONG *np
&mov
("
esi
",
&DWP
(
4
*
4
,"
esi
"));
# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov
("
esi
",
&DWP
(
0
,"
esi
"));
# pull n0[0]
&mov
(
$_rp
,"
eax
");
# ... save a copy of argument block
...
...
@@ -131,7 +148,7 @@ $mask="mm7";
&psrlq
(
$car1
,
32
);
&inc
(
$j
);
# j++
&set_label
("
1st
");
&set_label
("
1st
"
,
16
);
&pmuludq
(
$acc0
,
$mul0
);
# ap[j]*bp[0]
&pmuludq
(
$acc1
,
$mul1
);
# np[j]*m1
&paddq
(
$car0
,
$acc0
);
# +=c0
...
...
@@ -250,11 +267,11 @@ if (0) {
&xor
("
eax
","
eax
");
# signal "not fast enough [yet]"
&jmp
(
&label
("
just_leave
"));
# While the below code provides competitive performance for
# all key lengths on modern
cores, it's still a tad slower
#
for >=2048-bits keys on *elder* CPUs:-( "Competitive" means
#
compared to the original integer-only assembler. 512-bit
#
RSA sign is better by >=30%, but that's about all one can
# say about all CPUs...
# all key lengths on modern
Intel cores, it's still more
#
than 10% slower for 4096-bit key elsewhere:-( "Competitive"
#
means compared to the original integer-only assembler.
#
512-bit RSA sign is better by ~40%, but that's about all
#
one can
say about all CPUs...
}
else
{
$inp
=
"
esi
";
# integer path uses these registers differently
$word
=
"
edi
";
...
...
@@ -496,13 +513,13 @@ $sbit=$num;
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# ap[j]*ap[i]
&add
("
eax
",
$carry
);
&lea
(
$
j
,
&DWP
(
1
,
$j
));
&lea
(
$
carry
,
&DWP
(
0
,"
eax
","
eax
"
));
&adc
("
edx
",
0
);
&lea
(
$carry
,
&DWP
(
0
,
$sbit
,"
eax
",
2
));
&shr
("
eax
",
31
);
&cmp
(
$carry
,
$sbit
);
&add
(
$carry
,
&DWP
(
$frame
,"
esp
",
$j
,
4
));
# +=tp[j]
&lea
(
$j
,
&DWP
(
1
,
$j
));
&adc
("
eax
",
0
);
&add
(
$carry
,
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
));
# +=tp[j]
&add
(
$carry
,
$sbit
);
&adc
("
eax
",
0
);
&cmp
(
$j
,
$_num
);
&mov
(
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
),
$carry
);
# tp[j]=
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录