Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
48d2335d
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
48d2335d
编写于
11月 27, 2006
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add a non-SSE2 path to bn_mul_mont. It is disabled for now because it currently
does not give a performance improvement.
上级
96ea4ae9
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
187 additions
and
41 deletions
+187
-41
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86-mont.pl
+187
-41
未找到文件。
crypto/bn/asm/x86-mont.pl
浏览文件 @
48d2335d
...
...
@@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
...
...
@@ -31,12 +32,12 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&function_begin
("
bn_mul_mont
",
$sse2
?"
EXTRN
\t
_OPENSSL_ia32cap_P:DWORD
":"");
$i
=
"
e
b
x
";
$i
=
"
e
d
x
";
$j
=
"
ecx
";
$ap
=
"
esi
";
$rp
=
"
edi
";
$bp
=
"
edi
";
# overlapping variables!!!
$np
=
"
e
dx
";
$num
=
"
eb
p
";
$np
=
"
e
bp
";
$num
=
"
eb
x
";
$_rp
=
&DWP
(
4
*
0
,"
esp
");
# stack top layout
$_ap
=
&DWP
(
4
*
1
,"
esp
");
...
...
@@ -45,21 +46,13 @@ $_np=&DWP(4*3,"esp");
$_n0
=
&DWP
(
4
*
4
,"
esp
");
$_num
=
&DWP
(
4
*
5
,"
esp
");
$_sp
=
&DWP
(
4
*
6
,"
esp
");
$_bpend
=
&DWP
(
4
*
7
,"
esp
");
$frame
=
32
;
# size of above frame rounded up to 16n
$acc0
=
"
mm0
";
# mmx register bank layout
$acc1
=
"
mm1
";
$car0
=
"
mm2
";
$car1
=
"
mm3
";
$mul0
=
"
mm4
";
$mul1
=
"
mm5
";
$temp
=
"
mm6
";
$mask
=
"
mm7
";
if
(
$sse2
)
{
&picmeup
("
eax
","
OPENSSL_ia32cap_P
");
&bt
(
&DWP
(
0
,"
eax
"),
26
);
&jnc
(
&label
("
non_sse2
"));
&xor
("
eax
","
eax
");
&mov
("
edi
",
&wparam
(
5
));
# int num
&cmp
("
edi
",
3
);
&jb
(
&label
("
just_leave
"));
################################# load argument block...
&mov
("
eax
",
&wparam
(
0
));
# BN_ULONG *rp
...
...
@@ -67,16 +60,14 @@ if($sse2) {
&mov
("
ecx
",
&wparam
(
2
));
# const BN_ULONG *bp
&mov
("
edx
",
&wparam
(
3
));
# const BN_ULONG *np
&mov
("
esi
",
&wparam
(
4
));
# const BN_ULONG *n0
&mov
(
$num
,
&wparam
(
5
));
# int num
#&mov ("edi"
,&wparam(5)); # int num
&mov
("
edi
","
esp
");
# saved stack pointer!
&add
(
$num
,
1
);
# extra word on top of tp
&neg
(
$num
);
&lea
("
esp
",
&DWP
(
-
$frame
,"
esp
",
$num
,
4
));
# alloca($frame+8*($num+1))
&neg
(
$num
);
&and
("
esp
",
-
1024
);
# minimize TLB utilization
&sub
(
$num
,
1
);
# num is restored to its original value
# and will remain constant from now...
&mov
("
ebp
","
esp
");
# saved stack pointer!
&add
("
edi
",
2
);
# extra two words on top of tp
&neg
("
edi
");
&lea
("
esp
",
&DWP
(
-
$frame
,"
esp
","
edi
",
4
));
# alloca($frame+4*(num+2))
&neg
("
edi
");
&and
("
esp
",
-
4096
);
# minimize TLB utilization
&mov
("
esi
",
&DWP
(
0
,"
esi
"));
# pull n0[0]
&mov
(
$_rp
,"
eax
");
# ... save a copy of argument block
...
...
@@ -84,8 +75,23 @@ if($sse2) {
&mov
(
$_bp
,"
ecx
");
&mov
(
$_np
,"
edx
");
&mov
(
$_n0
,"
esi
");
#&mov ($_num,$num); # redundant in sse2 context
&mov
(
$_sp
,"
edi
");
# saved stack pointer!
&lea
(
$num
,
&DWP
(
-
2
,"
edi
"));
# num is restored to its original value
#&mov ($_num,$num); # redundant as $num is not reused
&mov
(
$_sp
,"
ebp
");
# saved stack pointer!
if
(
$sse2
)
{
$acc0
=
"
mm0
";
# mmx register bank layout
$acc1
=
"
mm1
";
$car0
=
"
mm2
";
$car1
=
"
mm3
";
$mul0
=
"
mm4
";
$mul1
=
"
mm5
";
$temp
=
"
mm6
";
$mask
=
"
mm7
";
&picmeup
("
eax
","
OPENSSL_ia32cap_P
");
&bt
(
&DWP
(
0
,"
eax
"),
26
);
&jnc
(
&label
("
non_sse2
"));
&mov
("
eax
",
-
1
);
&movd
(
$mask
,"
eax
");
# mask 32 lower bits
...
...
@@ -195,7 +201,153 @@ if($sse2) {
&jl
(
&label
("
outer
"));
&emms
();
# done with mmx bank
&jmp
(
&label
("
common_tail
"));
&set_label
("
non_sse2
",
16
);
}
if
(
1
)
{
&mov
("
esp
",
$_sp
);
&xor
("
eax
","
eax
");
# signal "not fast enough [yet]"
&jmp
(
&label
("
just_leave
"));
# The code below gives a ~15% improvement on the 512-bit benchmark
# *only* :-( On all other key lengths it is slower by up to 20%.
# This is because the original code path reduces the overall
# number of multiplications by ~25% by using bn_sqr_words.
# In other words, for the code below to be competitive,
# a dedicated squaring procedure is a must...
}
else
{
$inp
=
"
esi
";
# integer path uses these registers differently
$word
=
"
edi
";
$carry
=
"
ebp
";
&sub
(
$num
,
1
);
# non-SSE2 path uses num-1
&mov
(
$inp
,
$_ap
);
&mov
(
$word
,
$_bp
);
&lea
("
eax
",
&DWP
(
4
,
$word
,
$num
,
4
));
# &bp[num]
&mov
(
$word
,
&DWP
(
0
,
$word
));
# bp[0]
&mov
(
$_bpend
,"
eax
");
&xor
(
$j
,
$j
);
&xor
("
edx
","
edx
");
&set_label
("
mull
",
16
);
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$j
,
4
));
# ap[j]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# ap[j]*bp[0]
&lea
(
$j
,
&DWP
(
1
,
$j
));
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
),"
eax
");
# tp[j]=
&cmp
(
$j
,
$num
);
&jb
(
&label
("
mull
"));
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$num
,
4
));
# ap[num-1]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# ap[num-1]*bp[0]
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
$word
,
$_n0
);
&mov
(
$inp
,
$_np
);
&imul
(
$word
,
&DWP
(
$frame
,"
esp
"));
# n0*tp[0]
&mov
(
&DWP
(
$frame
,"
esp
",
$num
,
4
),"
eax
");
# tp[num-1]=
&xor
(
$j
,
$j
);
&mov
(
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
),"
edx
");
# tp[num]=
&mov
(
&DWP
(
$frame
+
8
,"
esp
",
$num
,
4
),
$j
);
# tp[num+1]=
&mov
("
eax
",
&DWP
(
0
,
$inp
));
# np[0]
&mul
(
$word
);
# np[0]*m
&add
("
eax
",
&DWP
(
$frame
,"
esp
"));
# +=tp[0]
&adc
("
edx
",
0
);
&mov
(
$j
,
1
);
&jmp
(
&label
("
2ndmadd
"));
&set_label
("
1stmadd
",
16
);
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$j
,
4
));
# ap[j]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# ap[j]*bp[i]
&lea
(
$j
,
&DWP
(
1
,
$j
));
&add
("
eax
",
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
));
# +=tp[j]
&adc
("
edx
",
0
);
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
),"
eax
");
# tp[j]=
&cmp
(
$j
,
$num
);
&jb
(
&label
("
1stmadd
"));
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$num
,
4
));
# ap[num-1]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# ap[num-1]*bp[i]
&add
("
eax
",
&DWP
(
$frame
,"
esp
",
$num
,
4
));
# +=tp[num-1]
&adc
("
edx
",
0
);
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
$word
,
$_n0
);
&mov
(
$inp
,
$_np
);
&imul
(
$word
,
&DWP
(
$frame
,"
esp
"));
# n0*tp[0]
&xor
(
$j
,
$j
);
&add
("
edx
",
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
));
# carry+=tp[num]
&mov
(
&DWP
(
$frame
,"
esp
",
$num
,
4
),"
eax
");
# tp[num-1]=
&adc
(
$j
,
0
);
&mov
(
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
),"
edx
");
# tp[num]=
&mov
(
&DWP
(
$frame
+
8
,"
esp
",
$num
,
4
),
$j
);
# tp[num+1]=
&mov
("
eax
",
&DWP
(
0
,
$inp
));
# np[0]
&mul
(
$word
);
# np[0]*m
&add
("
eax
",
&DWP
(
$frame
,"
esp
"));
# +=tp[0]
&adc
("
edx
",
0
);
&mov
(
$j
,
1
);
&set_label
("
2ndmadd
",
16
);
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$j
,
4
));
# np[j]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# np[j]*m
&lea
(
$j
,
&DWP
(
1
,
$j
));
&add
("
eax
",
&DWP
(
$frame
-
4
,"
esp
",
$j
,
4
));
# +=tp[j]
&adc
("
edx
",
0
);
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
&DWP
(
$frame
-
8
,"
esp
",
$j
,
4
),"
eax
");
# tp[j-1]=
&cmp
(
$j
,
$num
);
&jb
(
&label
("
2ndmadd
"));
&mov
("
eax
",
&DWP
(
0
,
$inp
,
$num
,
4
));
# np[num-1]
&mov
(
$carry
,"
edx
");
&mul
(
$word
);
# np[num-1]*m
&add
("
eax
",
&DWP
(
$frame
,"
esp
",
$num
,
4
));
# +=tp[num-1]
&adc
("
edx
",
0
);
&add
("
eax
",
$carry
);
&adc
("
edx
",
0
);
&mov
(
&DWP
(
$frame
-
4
,"
esp
",
$num
,
4
),"
eax
");
# tp[num-2]=
&xor
("
eax
","
eax
");
&add
("
edx
",
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
));
# carry+=tp[num]
&adc
("
eax
",
&DWP
(
$frame
+
8
,"
esp
",
$num
,
4
));
# +=tp[num+1]
&mov
(
&DWP
(
$frame
,"
esp
",
$num
,
4
),"
edx
");
# tp[num-1]=
&mov
(
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
),"
eax
");
# tp[num]=
&mov
(
$carry
,
$_bp
);
# &bp[i]
&add
(
$carry
,
4
);
&cmp
(
$carry
,
$_bpend
);
&je
(
&label
("
x86done
"));
&mov
(
$word
,
&DWP
(
0
,
$carry
));
# bp[i]
&mov
(
$inp
,
$_ap
);
&mov
(
$_bp
,
$carry
);
# &bp[++i]
&xor
(
$j
,
$j
);
&xor
("
edx
","
edx
");
&jmp
(
&label
("
1stmadd
"));
&set_label
("
x86done
",
16
);
&mov
(
$np
,
$_np
);
# make adjustments for tail processing
&add
(
$num
,
1
);
}
&set_label
("
common_tail
",
16
);
&mov
("
esi
",
&DWP
(
$frame
,"
esp
",
$num
,
4
));
# load uppermost overflow bit
&mov
(
$rp
,
$_rp
);
# load result pointer
# [$ap and $bp are zapped]
...
...
@@ -206,15 +358,15 @@ if($sse2) {
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$j
,
4
));
&cmp
("
eax
",
&DWP
(
0
,
$np
,
$j
,
4
));
# tp[num-1]-np[num-1]?
&jae
(
&label
("
sub
"));
# if taken CF is cleared
&set_label
("
copy
");
&set_label
("
copy
"
,
16
);
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$j
,
4
));
&mov
(
&DWP
(
0
,
$rp
,
$j
,
4
),"
eax
");
# rp[i]=tp[i]
&mov
(
&DWP
(
$frame
,"
esp
",
$j
,
4
),
$j
);
# zap temporary vector
&dec
(
$j
);
&jge
(
&label
("
copy
"));
&jmp
(
&label
("
exit
_sse2
"));
&jmp
(
&label
("
exit
"));
&set_label
("
sub
",
4
);
&set_label
("
sub
",
16
);
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$i
,
4
));
&sbb
("
eax
",
&DWP
(
0
,
$np
,
$i
,
4
));
&mov
(
&DWP
(
0
,
$rp
,
$i
,
4
),"
eax
");
# rp[i]=tp[i]-np[i]
...
...
@@ -224,21 +376,15 @@ if($sse2) {
&lea
(
$j
,
&DWP
(
-
1
,
$num
));
# j=num-1
&sbb
("
esi
",
0
);
# esi holds uppermost overflow bit
&jc
(
&label
("
copy
"));
&set_label
("
zap
");
&set_label
("
zap
"
,
16
);
&mov
(
&DWP
(
$frame
,"
esp
",
$j
,
4
),
$i
);
# zap temporary vector
&dec
(
$j
);
&jge
(
&label
("
zap
"));
&set_label
("
exit
_sse2
"
);
&set_label
("
exit
",
4
);
&mov
("
esp
",
$_sp
);
# pull saved stack pointer
&mov
("
eax
",
1
);
&jmp
(
&label
("
leave
"));
&set_label
("
non_sse2
");
}
&xor
("
eax
","
eax
");
# zero signals "not implemented [yet]"
&set_label
("
leave
");
&set_label
("
just_leave
");
&function_end
("
bn_mul_mont
");
&asm_finish
();
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录