Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
760d2551
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
760d2551
编写于
5月 24, 2011
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rc4-586.pl: 50% improvement on Core2 and 80% on Westmere.
上级
73ab3411
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
129 addition
and
8 deletion
+129
-8
crypto/rc4/asm/rc4-586.pl
crypto/rc4/asm/rc4-586.pl
+129
-8
未找到文件。
crypto/rc4/asm/rc4-586.pl
浏览文件 @
760d2551
...
...
@@ -28,6 +28,31 @@
#
# <appro@fy.chalmers.se>
# May 2011
#
# Optimize for Core2 and Westmere [and Opteron]. Current performance
# in cycles per processed byte (less is better) is:
#
# Pentium 10.2 # original numbers
# Pentium III 7.8(*)
# Intel P4 7.5
#
# Opteron 6.4/+14% # new MMX numbers
# Core2 5.8/+50%(**)
# Westmere 5.5/+80%(**)
# Sandy Bridge 5.4/0%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. While below
# MMX code delivering 5.8 on Core2 performs at 8.0 on PIII:-(
# As PIII is not a "hot" CPU [anymore], I chose not to introduce
# PIII-specific code path, which is why MMX code path is guarded
# by SSE2 bit (see below), not MMX.
# (**) Performance vs. block size on Core2 and Westmere had a maximum
# at ... 64 bytes block size. And it was quite a maximum, 40-60%
# in comparison to largest 8KB block size. Above improvement
# coefficients are for the largest block size.
$
0
=~
m/(.*[\/\\])[^\/\\]+$/
;
$dir
=
$
1
;
push
(
@INC
,"
${dir}
","
${dir}
../../perlasm
");
require
"
x86asm.pl
";
...
...
@@ -62,6 +87,60 @@ sub RC4_loop {
&$func
(
$out
,
&DWP
(
0
,
$dat
,
$ty
,
4
));
}
# Select the generator for one step of the 8-byte-per-iteration MMX loop.
#
# $RC4_loop_mmx is a code reference taking a single step index $i and
# emitting the perlasm instructions for that step.  $i controls which of
# the optional instructions are emitted:
#   - psllq on mm1 by 8*(($i-1)&7) bits is skipped when abs($i)==1;
#   - pxor into mm2 (from mm0 when $i==1, otherwise from mm1) only for $i>=0;
#   - the 8-byte load from ($inp) into mm0 only for $i<=0;
#   - the 8-byte store of mm2 to -8(out,$inp) only for $i==0;
#   - the final movd targets mm1 for $i>0 and mm2 otherwise.
# NOTE(review): $xx, $yy, $tx, $ty, $dat, $inp and $out are register names
# assigned outside this block -- confirm their roles against the file head.
#
# $alt is deliberately forced to 0 here, so the first ("alternative")
# generator is never selected; it is kept for reference.  $alt stays a
# package variable because code outside this block tests it as well.
$alt = 0;
if ($alt) {
	# works ~5% faster on Atom and ~20% slower on Core2
	# @XX holds a pair of registers whose order is rotated after
	# every step with $i>=0 (see the push/shift at the bottom).
	my @XX = ($xx, $out);
	$RC4_loop_mmx = sub {
		my $i = shift;

		&add	($yy, $tx);
		&movz	($yy, &LB($yy));
		&lea	($XX[1], &DWP(1, $XX[0]));
		&psllq	("mm1", 8*(($i-1)&7))		if (abs($i) != 1);
		&and	($XX[1], 0xff);
		&mov	($ty, &DWP(0, $dat, $yy, 4));
		&mov	(&DWP(0, $dat, $yy, 4), $tx);
		&mov	(&DWP(0, $dat, $XX[0], 4), $ty);
		&add	($ty, $tx);
		&movd	($XX[0], "mm7")			if ($i == 0);
		&movz	($ty, &LB($ty));
		&mov	($tx, &DWP(0, $dat, $XX[1], 4));
		&pxor	("mm2", $i==1 ? "mm0" : "mm1")	if ($i >= 0);
		&movq	("mm0", &QWP(0, $inp))		if ($i <= 0);
		&movq	(&QWP(-8, $XX[0], $inp), "mm2")	if ($i == 0);
		&movd	($i>0 ? "mm1" : "mm2", &DWP(0, $dat, $ty, 4));

		# swap the two registers in @XX for the next step
		push	(@XX, shift(@XX))		if ($i >= 0);
	}
}
else {
	$RC4_loop_mmx = sub {
		my $i = shift;

		&add	($yy, $tx);
		&movz	($yy, &LB($yy));			# (*)
		&psllq	("mm1", 8*(($i-1)&7))		if (abs($i) != 1);
		&mov	($ty, &DWP(0, $dat, $yy, 4));
		&mov	(&DWP(0, $dat, $yy, 4), $tx);
		&mov	(&DWP(0, $dat, $xx, 4), $ty);
		&inc	($xx);
		&add	($ty, $tx);
		&movz	($xx, &LB($xx));			# (*)
		&movz	($ty, &LB($ty));			# (*)
		&pxor	("mm2", $i==1 ? "mm0" : "mm1")	if ($i >= 0);
		&movq	("mm0", &QWP(0, $inp))		if ($i <= 0);
		&movq	(&QWP(-8, $out, $inp), "mm2")	if ($i == 0);
		&mov	($tx, &DWP(0, $dat, $xx, 4));
		&movd	($i>0 ? "mm1" : "mm2", &DWP(0, $dat, $ty, 4));

		# (*) This is the key to Core2 and Westmere performance.
		#	Without movz out-of-order execution logic confuses
		#	itself and fails to reorder loads and stores. Problem
		#	appears to be fixed in Sandy Bridge...
	}
}
&external_label
("
OPENSSL_ia32cap_P
");
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin
("
RC4
");
&mov
(
$dat
,
&wparam
(
0
));
# load key schedule pointer
...
...
@@ -94,11 +173,48 @@ sub RC4_loop {
&and
(
$ty
,
-
4
);
# how many 4-byte chunks?
&jz
(
&label
("
loop1
"));
&test
(
$ty
,
-
8
);
&mov
(
&wparam
(
3
),
$out
);
# $out as accumulator in these loops
&jz
(
&label
("
go4loop4
"));
&picmeup
(
$out
,"
OPENSSL_ia32cap_P
");
&bt
(
&DWP
(
0
,
$out
),
26
);
# check SSE2 bit [could have been MMX]
&jnc
(
&label
("
go4loop4
"));
&mov
(
$out
,
&wparam
(
3
))
if
(
!
$alt
);
&movd
("
mm7
",
&wparam
(
3
))
if
(
$alt
);
&and
(
$ty
,
-
8
);
&lea
(
$ty
,
&DWP
(
-
8
,
$inp
,
$ty
));
&mov
(
&wparam
(
2
),
$ty
);
&mov
(
&DWP
(
-
4
,
$dat
),
$ty
);
# save input+(len/8)*8-8
&$RC4_loop_mmx
(
-
1
);
&jmp
(
&label
("
loop_mmx_enter
"));
&set_label
("
loop_mmx
",
16
);
&$RC4_loop_mmx
(
0
);
&set_label
("
loop_mmx_enter
");
for
(
$i
=
1
;
$i
<
8
;
$i
++
)
{
&$RC4_loop_mmx
(
$i
);
}
&cmp
(
$inp
,
&DWP
(
-
4
,
$dat
));
&lea
(
$inp
,
&DWP
(
8
,
$inp
));
&jb
(
&label
("
loop_mmx
"));
&movd
(
$out
,"
mm7
")
if
(
$alt
);
&psllq
("
mm1
",
56
);
&pxor
("
mm2
","
mm1
");
&movq
(
&QWP
(
-
8
,
$out
,
$inp
),"
mm2
");
&emms
();
&cmp
(
$inp
,
&wparam
(
1
));
# compare to input+len
&je
(
&label
("
done
"));
&jmp
(
&label
("
loop1
"));
&set_label
("
go4loop4
",
16
);
&lea
(
$ty
,
&DWP
(
-
4
,
$inp
,
$ty
));
&mov
(
&wparam
(
2
),
$ty
);
# save input+(len/4)*4-4
&mov
(
&wparam
(
3
),
$out
);
# $out as accumulator in this loop
&set_label
("
loop4
"
,
16
);
&set_label
("
loop4
");
for
(
$i
=
0
;
$i
<
4
;
$i
++
)
{
RC4_loop
(
$i
);
}
&ror
(
$out
,
8
);
&xor
(
$out
,
&DWP
(
0
,
$inp
));
...
...
@@ -164,8 +280,6 @@ $idi="ebp";
$ido
=
"
ecx
";
$idx
=
"
edx
";
&external_label
("
OPENSSL_ia32cap_P
");
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
&function_begin
("
RC4_set_key
");
&mov
(
$out
,
&wparam
(
0
));
# load key
...
...
@@ -254,14 +368,21 @@ $idx="edx";
&blindpop
("
eax
");
&lea
("
eax
",
&DWP
(
&label
("
opts
")
.
"
-
"
.
&label
("
pic_point
"),"
eax
"));
&picmeup
("
edx
","
OPENSSL_ia32cap_P
");
&bt
(
&DWP
(
0
,"
edx
"),
20
);
&jnc
(
&label
("
skip
"));
&add
("
eax
",
12
);
&set_label
("
skip
");
&mov
("
edx
",
&DWP
(
0
,"
edx
"));
&bt
("
edx
",
20
);
&jc
(
&label
("
1xchar
"));
&bt
("
edx
",
26
);
&jnc
(
&label
("
ret
"));
&add
("
eax
",
25
);
&ret
();
&set_label
("
1xchar
");
&add
("
eax
",
12
);
&set_label
("
ret
");
&ret
();
&set_label
("
opts
",
64
);
&asciz
("
rc4(4x,int)
");
&asciz
("
rc4(1x,char)
");
&asciz
("
rc4(8x,mmx)
");
&asciz
("
RC4 for x86, CRYPTOGAMS by <appro
\@
openssl.org>
");
&align
(
64
);
&function_end_B
("
RC4_options
");
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录