Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
20c04a13
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
20c04a13
编写于
4月 26, 2007
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Reimplement rc4-586.pl, relicense rc4-x86_64.pl.
上级
a291745e
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
154 addition
and
202 deletion
+154
-202
crypto/rc4/asm/rc4-586.pl
crypto/rc4/asm/rc4-586.pl
+143
-198
crypto/rc4/asm/rc4-x86_64.pl
crypto/rc4/asm/rc4-x86_64.pl
+11
-4
未找到文件。
crypto/rc4/asm/rc4-586.pl
浏览文件 @
20c04a13
#!/usr/local/bin/perl
#!/usr/bin/env perl
# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimaly on latest IA-32
# assembler implementation performs suboptimally on latest IA-32
# microarchitectures. After re-tuning performance has changed as
# following:
#
# Pentium       +0%
# Pentium III   +17%
# AMD           +52%(*)
# P4            +180%(**)
# Pentium       -10%
# Pentium III   +12%
# AMD           +50%(*)
# P4            +250%(**)
#
# (*) This number is actually a trade-off:-) It's possible to
# achieve +72%, but at the cost of -48% off PIII performance.
...
...
@@ -17,209 +24,136 @@
# For reference! This code delivers ~80% of rc4-amd64.pl
# performance on the same Opteron machine.
# (**) This number requires compressed key schedule set up by
# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
# compressed key schedule is implemented in 0.9.8 and later,
# see commentary section in rc4_skey.c for further details].
# RC4_set_key [see commentary below for further details].
#
# <appro@fy.chalmers.se>
push
(
@INC
,"
perlasm
","
../../perlasm
");
$
0
=~
m/(.*[\/\\])[^\/\\]+$/
;
$dir
=
$
1
;
push
(
@INC
,"
${dir}
","
${dir}
../../perlasm
");
require
"
x86asm.pl
";
&asm_init
(
$ARGV
[
0
],"
rc4-586.pl
");
$x
=
"
eax
";
$y
=
"
ebx
";
$x
x
=
"
eax
";
$y
y
=
"
ebx
";
$tx
=
"
ecx
";
$ty
=
"
edx
";
$in
=
"
esi
";
$out
=
"
edi
";
$d
=
"
ebp
";
sub
RC4_loop
{
local
(
$n
,
$p
,
$char
)
=
@_
;
&comment
("
Round
$n
");
if
(
$char
)
{
if
(
$p
>=
0
)
{
&mov
(
$ty
,
&swtmp
(
2
));
&cmp
(
$ty
,
$in
);
&jbe
(
&label
("
finished
"));
&inc
(
$in
);
}
else
{
&add
(
$ty
,
8
);
&inc
(
$in
);
&cmp
(
$ty
,
$in
);
&jb
(
&label
("
finished
"));
&mov
(
&swtmp
(
2
),
$ty
);
}
}
# Moved out
# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
&add
(
&LB
(
$y
),
&LB
(
$tx
));
&mov
(
$ty
,
&DWP
(
0
,
$d
,
$y
,
4
));
# XXX
&mov
(
&DWP
(
0
,
$d
,
$x
,
4
),
$ty
);
&add
(
$ty
,
$tx
);
&mov
(
&DWP
(
0
,
$d
,
$y
,
4
),
$tx
);
&and
(
$ty
,
0xff
);
&inc
(
&LB
(
$x
));
# NEXT ROUND
&mov
(
$tx
,
&DWP
(
0
,
$d
,
$x
,
4
))
if
$p
<
1
;
# NEXT ROUND
&mov
(
$ty
,
&DWP
(
0
,
$d
,
$ty
,
4
));
if
(
!
$char
)
{
#moved up into last round
if
(
$p
>=
1
)
{
&add
(
$out
,
8
)
}
&movb
(
&BP
(
$n
,"
esp
","",
0
),
&LB
(
$ty
));
}
else
{
# Note in+=8 has occurred
&movb
(
&HB
(
$ty
),
&BP
(
-
1
,
$in
,"",
0
));
# XXX
&xorb
(
&LB
(
$ty
),
&HB
(
$ty
));
# XXX
&movb
(
&BP
(
$n
,
$out
,"",
0
),
&LB
(
$ty
));
}
$inp
=
"
esi
";
$out
=
"
ebp
";
$dat
=
"
edi
";
# Emit the instruction sequence for one round of the 4x-unrolled RC4 loop.
#
# Argument: the round index within the unrolled group (the caller in this
# file passes 0..3).  Operates on the file-level register names $xx, $yy,
# $tx, $ty, $dat and $out; $out is used as the accumulator for the group.
sub RC4_loop {
    my ($round) = @_;

    # The first round moves the fetched value into $out; every later round
    # merges into it with "or".
    my $merge = $round == 0 ? *mov : *or;

    &add(&LB($yy), &LB($tx));
    &mov($ty, &DWP(0, $dat, $yy, 4));
    &mov(&DWP(0, $dat, $yy, 4), $tx);
    &mov(&DWP(0, $dat, $xx, 4), $ty);
    &add($ty, $tx);
    &inc(&LB($xx));
    &and($ty, 0xff);

    # Rotate the accumulator by 8 on every round except the first.
    unless ($round == 0) {
        &ror($out, 8);
    }

    if ($round >= 3) {
        &mov($tx, &wparam(3));    # reload [re-biased] out
    }
    else {
        &mov($tx, &DWP(0, $dat, $xx, 4));
    }

    &$merge($out, &DWP(0, $dat, $ty, 4));
}
&function_begin_B
("
RC4
");
{
local
(
$name
)
=
@_
;
&mov
(
$ty
,
&wparam
(
1
));
# len
&cmp
(
$ty
,
0
);
&jne
(
&label
("
proceed
"));
&ret
();
&set_label
("
proceed
");
&comment
("");
&push
("
ebp
");
&push
("
ebx
");
&push
("
esi
");
&xor
(
$x
,
$x
);
# avoid partial register stalls
&push
("
edi
");
&xor
(
$y
,
$y
);
# avoid partial register stalls
&mov
(
$d
,
&wparam
(
0
));
# key
&mov
(
$in
,
&wparam
(
2
));
&movb
(
&LB
(
$x
),
&BP
(
0
,
$d
,"",
1
));
&movb
(
&LB
(
$y
),
&BP
(
4
,
$d
,"",
1
));
&mov
(
$out
,
&wparam
(
3
));
&inc
(
&LB
(
$x
));
&stack_push
(
3
);
# 3 temp variables
&add
(
$d
,
8
);
# detect compressed schedule, see commentary section in rc4_skey.c...
# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
# as compressed key schedule is set up in 0.9.8 and later.
&cmp
(
&DWP
(
256
,
$d
),
-
1
);
&je
(
&label
("
RC4_CHAR
"));
&lea
(
$ty
,
&DWP
(
-
8
,
$ty
,
$in
));
# check for 0 length input
&mov
(
&swtmp
(
2
),
$ty
);
# this is now address to exit at
&mov
(
$tx
,
&DWP
(
0
,
$d
,
$x
,
4
));
&cmp
(
$ty
,
$in
);
&jb
(
&label
("
end
"));
# less than 8 bytes
&set_label
("
start
");
# filling DELAY SLOT
&add
(
$in
,
8
);
&RC4_loop
(
0
,
-
1
,
0
);
&RC4_loop
(
1
,
0
,
0
);
&RC4_loop
(
2
,
0
,
0
);
&RC4_loop
(
3
,
0
,
0
);
&RC4_loop
(
4
,
0
,
0
);
&RC4_loop
(
5
,
0
,
0
);
&RC4_loop
(
6
,
0
,
0
);
&RC4_loop
(
7
,
1
,
0
);
&comment
("
apply the cipher text
");
# xor the cipher data with input
#&add( $out, 8); #moved up into last round
&mov
(
$tx
,
&swtmp
(
0
));
&mov
(
$ty
,
&DWP
(
-
8
,
$in
,"",
0
));
&xor
(
$tx
,
$ty
);
&mov
(
$ty
,
&DWP
(
-
4
,
$in
,"",
0
));
&mov
(
&DWP
(
-
8
,
$out
,"",
0
),
$tx
);
&mov
(
$tx
,
&swtmp
(
1
));
&xor
(
$tx
,
$ty
);
&mov
(
$ty
,
&swtmp
(
2
));
# load end ptr;
&mov
(
&DWP
(
-
4
,
$out
,"",
0
),
$tx
);
&mov
(
$tx
,
&DWP
(
0
,
$d
,
$x
,
4
));
&cmp
(
$in
,
$ty
);
&jbe
(
&label
("
start
"));
&set_label
("
end
");
# There is quite a bit of extra crap in RC4_loop() for this
# first round
&RC4_loop
(
0
,
-
1
,
1
);
&RC4_loop
(
1
,
0
,
1
);
&RC4_loop
(
2
,
0
,
1
);
&RC4_loop
(
3
,
0
,
1
);
&RC4_loop
(
4
,
0
,
1
);
&RC4_loop
(
5
,
0
,
1
);
&RC4_loop
(
6
,
1
,
1
);
&jmp
(
&label
("
finished
"));
&align
(
16
);
# this is essentially Intel P4 specific codepath, see rc4_skey.c,
# and is engaged in 0.9.8 and later context...
&set_label
("
RC4_CHAR
");
&lea
(
$ty
,
&DWP
(
0
,
$in
,
$ty
));
&mov
(
&swtmp
(
2
),
$ty
);
&movz
(
$tx
,
&BP
(
0
,
$d
,
$x
));
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin
("
RC4
");
&mov
(
$dat
,
&wparam
(
0
));
# load key schedule pointer
&mov
(
$ty
,
&wparam
(
1
));
# load len
&mov
(
$inp
,
&wparam
(
2
));
# load inp
&mov
(
$out
,
&wparam
(
3
));
# load out
&xor
(
$xx
,
$xx
);
# avoid partial register stalls
&xor
(
$yy
,
$yy
);
&cmp
(
$ty
,
0
);
# safety net
&je
(
&label
("
abort
"));
&mov
(
&LB
(
$xx
),
&BP
(
0
,
$dat
));
# load key->x
&mov
(
&LB
(
$yy
),
&BP
(
4
,
$dat
));
# load key->y
&add
(
$dat
,
8
);
&lea
(
$tx
,
&DWP
(
0
,
$inp
,
$ty
));
&sub
(
$out
,
$inp
);
# re-bias out
&mov
(
&wparam
(
1
),
$tx
);
# save input+len
&inc
(
&LB
(
$xx
));
# detect compressed key schedule...
&cmp
(
&DWP
(
256
,
$dat
),
-
1
);
&je
(
&label
("
RC4_CHAR
"));
&mov
(
$tx
,
&DWP
(
0
,
$dat
,
$xx
,
4
));
&and
(
$ty
,
-
4
);
# how many 4-byte chunks?
&jz
(
&label
("
loop1
"));
&lea
(
$ty
,
&DWP
(
-
4
,
$inp
,
$ty
));
&mov
(
&wparam
(
2
),
$ty
);
# save input+(len/4)*4-4
&mov
(
&wparam
(
3
),
$out
);
# $out as accumulator in this loop
&set_label
("
loop4
",
16
);
for
(
$i
=
0
;
$i
<
4
;
$i
++
)
{
RC4_loop
(
$i
);
}
&ror
(
$out
,
8
);
&xor
(
$out
,
&DWP
(
0
,
$inp
));
&cmp
(
$inp
,
&wparam
(
2
));
# compare to input+(len/4)*4-4
&mov
(
&DWP
(
0
,
$tx
,
$inp
),
$out
);
# $tx holds re-biased out here
&lea
(
$inp
,
&DWP
(
4
,
$inp
));
&mov
(
$tx
,
&DWP
(
0
,
$dat
,
$xx
,
4
));
&jb
(
&label
("
loop4
"));
&cmp
(
$inp
,
&wparam
(
1
));
# compare to input+len
&je
(
&label
("
done
"));
&mov
(
$out
,
&wparam
(
3
));
# restore $out
&set_label
("
loop1
",
16
);
&add
(
&LB
(
$yy
),
&LB
(
$tx
));
&mov
(
$ty
,
&DWP
(
0
,
$dat
,
$yy
,
4
));
&mov
(
&DWP
(
0
,
$dat
,
$yy
,
4
),
$tx
);
&mov
(
&DWP
(
0
,
$dat
,
$xx
,
4
),
$ty
);
&add
(
$ty
,
$tx
);
&inc
(
&LB
(
$xx
));
&and
(
$ty
,
0xff
);
&mov
(
$ty
,
&DWP
(
0
,
$dat
,
$ty
,
4
));
&xor
(
&LB
(
$ty
),
&BP
(
0
,
$inp
));
&lea
(
$inp
,
&DWP
(
1
,
$inp
));
&mov
(
$tx
,
&DWP
(
0
,
$dat
,
$xx
,
4
));
&cmp
(
$inp
,
&wparam
(
1
));
# compare to input+len
&mov
(
&BP
(
-
1
,
$out
,
$inp
),
&LB
(
$ty
));
&jb
(
&label
("
loop1
"));
&jmp
(
&label
("
done
"));
# this is essentially Intel P4 specific codepath...
&set_label
("
RC4_CHAR
",
16
);
&movz
(
$tx
,
&BP
(
0
,
$dat
,
$xx
));
# strangely enough unrolled loop performs over 20% slower...
&set_label
("
RC4_CHAR_loop
");
&add
(
&LB
(
$y
),
&LB
(
$tx
));
&movz
(
$ty
,
&BP
(
0
,
$d
,
$
y
));
&mov
b
(
&BP
(
0
,
$d
,
$
y
),
&LB
(
$tx
));
&mov
b
(
&BP
(
0
,
$d
,
$
x
),
&LB
(
$ty
));
&set_label
("
cloop1
");
&add
(
&LB
(
$y
y
),
&LB
(
$tx
));
&movz
(
$ty
,
&BP
(
0
,
$d
at
,
$y
y
));
&mov
(
&BP
(
0
,
$dat
,
$y
y
),
&LB
(
$tx
));
&mov
(
&BP
(
0
,
$dat
,
$x
x
),
&LB
(
$ty
));
&add
(
&LB
(
$ty
),
&LB
(
$tx
));
&movz
(
$ty
,
&BP
(
0
,
$d
,
$ty
));
&add
(
&LB
(
$x
),
1
);
&xorb
(
&LB
(
$ty
),
&BP
(
0
,
$in
));
&lea
(
$in
,
&BP
(
1
,
$in
));
&movz
(
$tx
,
&BP
(
0
,
$d
,
$x
));
&cmp
(
$in
,
&swtmp
(
2
));
&movb
(
&BP
(
0
,
$out
),
&LB
(
$ty
));
&lea
(
$out
,
&BP
(
1
,
$out
));
&jb
(
&label
("
RC4_CHAR_loop
"));
&set_label
("
finished
");
&dec
(
$x
);
&stack_pop
(
3
);
&movb
(
&BP
(
-
4
,
$d
,"",
0
),
&LB
(
$y
));
&movb
(
&BP
(
-
8
,
$d
,"",
0
),
&LB
(
$x
));
}
&movz
(
$ty
,
&BP
(
0
,
$dat
,
$ty
));
&add
(
&LB
(
$xx
),
1
);
&xor
(
&LB
(
$ty
),
&BP
(
0
,
$inp
));
&lea
(
$inp
,
&BP
(
1
,
$inp
));
&movz
(
$tx
,
&BP
(
0
,
$dat
,
$xx
));
&cmp
(
$inp
,
&wparam
(
1
));
&mov
(
&BP
(
-
1
,
$out
,
$inp
),
&LB
(
$ty
));
&jb
(
&label
("
cloop1
"));
&set_label
("
done
");
&dec
(
&LB
(
$xx
));
&mov
(
&BP
(
-
4
,
$dat
),
&LB
(
$yy
));
# save key->y
&mov
(
&BP
(
-
8
,
$dat
),
&LB
(
$xx
));
# save key->x
&set_label
("
abort
");
&function_end
("
RC4
");
########################################################################
...
...
@@ -271,6 +205,17 @@ $idx="edx";
&jnc
(
&label
("
w2ndloop
"));
&jmp
(
&label
("
exit
"));
# Unlike all other x86 [and x86_64] implementations, Intel P4 core
# [including EM64T] was found to perform poorly with above "32-bit" key
# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
# schedule for x86[_64], because non-P4 implementations suffer from
# significant performance losses then, e.g. PIII exhibits >2x
# deterioration, and so does Opteron. In order to assure optimal
# all-round performance, we detect P4 at run-time and set up compressed
# key schedule, which is recognized by RC4 procedure.
&set_label
("
c1stloop
",
16
);
&mov
(
&BP
(
0
,
$out
,"
eax
"),
&LB
("
eax
"));
# key->data[i]=i;
&add
(
&LB
("
eax
"),
1
);
# i++;
...
...
@@ -315,9 +260,9 @@ $idx="edx";
&set_label
("
skip
");
&ret
();
&set_label
("
opts
",
64
);
&asciz
("
rc4(
8
x,int)
");
&asciz
("
rc4(
4
x,int)
");
&asciz
("
rc4(1x,char)
");
&asciz
("
RC4 for x86,
OpenSSL project
");
# RC4_version
&asciz
("
RC4 for x86,
CRYPTOGAMS by <appro
\@
openssl.org>
");
&align
(
64
);
&function_end_B
("
RC4_options
");
...
...
crypto/rc4/asm/rc4-x86_64.pl
浏览文件 @
20c04a13
...
...
@@ -2,8 +2,9 @@
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
...
...
@@ -58,7 +59,13 @@
# this CPU.
$output
=
shift
;
open
STDOUT
,"
| $^X ../perlasm/x86_64-xlate.pl
$output
";
$
0
=~
m/(.*[\/\\])[^\/\\]+$/
;
$dir
=
$
1
;
(
$xlate
=
"
${dir}
x86_64-xlate.pl
"
and
-
f
$xlate
)
or
(
$xlate
=
"
${dir}
../../perlasm/x86_64-xlate.pl
"
and
-
f
$xlate
)
or
die
"
can't locate x86_64-xlate.pl
";
open
STDOUT
,"
| $^X
$xlate
$output
";
$dat
=
"
%rdi
";
# arg1
$len
=
"
%rsi
";
# arg2
...
...
@@ -345,7 +352,7 @@ RC4_options:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
.asciz "rc4(1x,char)"
.asciz "RC4 for x86_64,
OpenSSL project
"
.asciz "RC4 for x86_64,
CRYPTOGAMS by <appro\@openssl.org>
"
.align 64
.size RC4_options,.-RC4_options
___
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录