Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
89f1eb82
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
89f1eb82
编写于
11月 12, 2012
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
aes-586.pl: Atom-specific optimization, +44/29%, minor improvement on others.
vpaes-x86.pl: minor performance squeeze.
上级
f717abd7
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
180 additions
and
174 deletions
+180
-174
crypto/aes/asm/aes-586.pl
crypto/aes/asm/aes-586.pl
+135
-128
crypto/aes/asm/vpaes-x86.pl
crypto/aes/asm/vpaes-x86.pl
+45
-46
未找到文件。
crypto/aes/asm/aes-586.pl
浏览文件 @
89f1eb82
...
...
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
# ECB encrypt ECB decrypt CBC large chunk
# P4 56[60] 84[100] 23
# AMD K8 48[44] 70[79] 18
# PIII 41[50] 61[91] 24
# Core 2 32[38] 45[70] 18.5
# Pentium 120 160 77
# P4 52[54] 83[95] 23
# AMD K8 46[41] 66[70] 18
# PIII 41[50] 60[77] 24
# Core 2 31[36] 45[64] 18.5
# Atom 76[100] 96[138] 60
# Pentium 115 150 77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
...
...
@@ -476,24 +477,25 @@ sub enctransform()
my
$tmp
=
$tbl
;
my
$r2
=
$key
;
&mov
(
$acc
,
$s
[
$i
]);
&and
(
$acc
,
0x80808080
);
&mov
(
$tmp
,
$acc
);
&shr
(
$tmp
,
7
);
&and
(
$tmp
,
$s
[
$i
]);
&lea
(
$r2
,
&DWP
(
0
,
$s
[
$i
],
$s
[
$i
]));
&sub
(
$acc
,
$tmp
);
&mov
(
$acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&and
(
$r2
,
0xfefefefe
);
&
and
(
$acc
,
0x1b1b1b1b
);
&
sub
(
$acc
,
$tmp
);
&mov
(
$tmp
,
$s
[
$i
]);
&and
(
$acc
,
0x1b1b1b1b
);
&rotr
(
$tmp
,
16
);
&xor
(
$acc
,
$r2
);
# r2
&mov
(
$r2
,
$s
[
$i
]);
&xor
(
$s
[
$i
],
$acc
);
# r0 ^ r2
&rotr
(
$r2
,
16
+
8
);
&xor
(
$acc
,
$tmp
);
&rotl
(
$s
[
$i
],
24
);
&xor
(
$acc
,
$r2
);
&mov
(
$tmp
,
0x80808080
)
if
(
$i
!=
1
);
&xor
(
$s
[
$i
],
$acc
);
# ROTATE(r2^r0,24) ^ r2
&rotr
(
$tmp
,
16
);
&xor
(
$s
[
$i
],
$tmp
);
&rotr
(
$tmp
,
8
);
&xor
(
$s
[
$i
],
$tmp
);
}
&function_begin_B
("
_x86_AES_encrypt_compact
");
...
...
@@ -526,6 +528,7 @@ sub enctransform()
&enccompact
(
1
,
$tbl
,
$s1
,
$s2
,
$s3
,
$s0
,
1
);
&enccompact
(
2
,
$tbl
,
$s2
,
$s3
,
$s0
,
$s1
,
1
);
&enccompact
(
3
,
$tbl
,
$s3
,
$s0
,
$s1
,
$s2
,
1
);
&mov
(
$tbl
,
0x80808080
);
&enctransform
(
2
);
&enctransform
(
3
);
&enctransform
(
0
);
...
...
@@ -607,82 +610,84 @@ sub sse_enccompact()
&pshufw
("
mm5
","
mm4
",
0x0d
);
# 15,14,11,10
&movd
("
eax
","
mm1
");
# 5, 4, 1, 0
&movd
("
ebx
","
mm5
");
# 15,14,11,10
&mov
(
$__key
,
$key
);
&movz
(
$acc
,
&LB
("
eax
"));
# 0
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 0
&pshufw
("
mm2
","
mm0
",
0x0d
);
# 7, 6, 3, 2
&movz
("
edx
",
&HB
("
eax
"));
# 1
&pshufw
("
mm2
","
mm0
",
0x0d
);
# 7, 6, 3, 2
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 0
&movz
(
$key
,
&LB
("
ebx
"));
# 10
&movz
("
edx
",
&BP
(
-
128
,
$tbl
,"
edx
",
1
));
# 1
&shl
("
edx
",
8
);
# 1
&shr
("
eax
",
16
);
# 5, 4
&shl
("
edx
",
8
);
# 1
&movz
(
$acc
,
&
LB
("
ebx
"));
# 10
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 10
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 10
&movz
(
$
key
,
&HB
("
ebx
"));
# 11
&shl
(
$acc
,
16
);
# 10
&or
("
ecx
",
$acc
);
# 10
&pshufw
("
mm6
","
mm4
",
0x08
);
# 13,12, 9, 8
&movz
(
$acc
,
&HB
("
ebx
"));
# 11
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 11
&or
("
ecx
",
$acc
);
# 10
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 11
&movz
(
$key
,
&HB
("
eax
"));
# 5
&shl
(
$acc
,
24
);
# 11
&or
("
edx
",
$acc
);
# 11
&shr
("
ebx
",
16
);
# 15,14
&or
("
edx
",
$acc
);
# 11
&movz
(
$acc
,
&
HB
("
eax
"));
# 5
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 5
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 5
&movz
(
$
key
,
&HB
("
ebx
"));
# 15
&shl
(
$acc
,
8
);
# 5
&or
("
ecx
",
$acc
);
# 5
&movz
(
$acc
,
&
HB
("
ebx
"));
# 15
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 15
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 15
&movz
(
$
key
,
&LB
("
eax
"));
# 4
&shl
(
$acc
,
24
);
# 15
&or
("
ecx
",
$acc
);
# 15
&movd
("
mm0
","
ecx
");
# t[0] collected
&movz
(
$acc
,
&
LB
("
eax
"));
# 4
&movz
(
"
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 4
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 4
&movz
(
$key
,
&LB
("
ebx
"));
# 14
&movd
("
eax
","
mm2
");
# 7, 6, 3, 2
&movz
(
$acc
,
&LB
("
ebx
"));
# 14
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 14
&shl
(
$acc
,
16
);
# 14
&movd
("
mm0
","
ecx
");
# t[0] collected
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 14
&movz
(
$key
,
&HB
("
eax
"));
# 3
&shl
("
ecx
",
16
);
# 14
&movd
("
ebx
","
mm6
");
# 13,12, 9, 8
&or
("
ecx
",
$acc
);
# 14
&movd
("
ebx
","
mm6
");
# 13,12, 9, 8
&movz
(
$acc
,
&HB
("
eax
"));
# 3
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 3
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 3
&movz
(
$key
,
&HB
("
ebx
"));
# 9
&shl
(
$acc
,
24
);
# 3
&or
("
ecx
",
$acc
);
# 3
&movz
(
$acc
,
&
HB
("
ebx
"));
# 9
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 9
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 9
&movz
(
$
key
,
&LB
("
ebx
"));
# 8
&shl
(
$acc
,
8
);
# 9
&shr
("
ebx
",
16
);
# 13,12
&or
("
ecx
",
$acc
);
# 9
&movd
("
mm1
","
ecx
");
# t[1] collected
&movz
(
$acc
,
&LB
("
ebx
"));
# 8
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 8
&shr
("
ebx
",
16
);
# 13,12
&movz
(
$acc
,
&LB
("
eax
"));
# 2
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 2
&shl
(
$acc
,
16
);
# 2
&or
("
ecx
",
$acc
);
# 2
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 8
&movz
(
$key
,
&LB
("
eax
"));
# 2
&shr
("
eax
",
16
);
# 7, 6
&movd
("
mm1
","
ecx
");
# t[1] collected
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 2
&movz
(
$key
,
&HB
("
eax
"));
# 7
&shl
("
ecx
",
16
);
# 2
&and
("
eax
",
0xff
);
# 6
&or
("
ecx
",
$acc
);
# 2
&punpckldq
("
mm0
","
mm1
");
# t[0,1] collected
&movz
(
$acc
,
&
HB
("
eax
"));
# 7
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 7
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 7
&movz
(
$
key
,
&HB
("
ebx
"));
# 13
&shl
(
$acc
,
24
);
# 7
&or
("
ecx
",
$acc
);
# 7
&and
("
eax
",
0xff
);
# 6
&and
("
ebx
",
0xff
);
# 12
&movz
("
eax
",
&BP
(
-
128
,
$tbl
,"
eax
",
1
));
# 6
&or
("
ecx
",
$acc
);
# 7
&shl
("
eax
",
16
);
# 6
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 13
&or
("
edx
","
eax
");
# 6
&movz
(
$acc
,
&HB
("
ebx
"));
# 13
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 13
&shl
(
$acc
,
8
);
# 13
&or
("
ecx
",
$acc
);
# 13
&movd
("
mm4
","
ecx
");
# t[2] collected
&and
("
ebx
",
0xff
);
# 12
&movz
("
ebx
",
&BP
(
-
128
,
$tbl
,"
ebx
",
1
));
# 12
&or
("
ecx
",
$acc
);
# 13
&or
("
edx
","
ebx
");
# 12
&mov
(
$key
,
$__key
);
&movd
("
mm4
","
ecx
");
# t[2] collected
&movd
("
mm5
","
edx
");
# t[3] collected
&punpckldq
("
mm4
","
mm5
");
# t[2,3] collected
...
...
@@ -1270,30 +1275,30 @@ sub dectransform()
my
$tp4
=
@s
[(
$i
+
3
)
%
4
];
$tp4
=
@s
[
3
]
if
(
$i
==
1
);
my
$tp8
=
$tbl
;
&mov
(
$
acc
,
$s
[
$i
]
);
&and
(
$
acc
,
0x80808080
);
&mov
(
$
tmp
,
$acc
);
&mov
(
$
tmp
,
0x80808080
);
&and
(
$
tmp
,
$s
[
$i
]
);
&mov
(
$
acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&lea
(
$tp2
,
&DWP
(
0
,
$s
[
$i
],
$s
[
$i
]));
&sub
(
$acc
,
$tmp
);
&and
(
$tp2
,
0xfefefefe
);
&and
(
$acc
,
0x1b1b1b1b
);
&xor
(
$
acc
,
$tp2
);
&mov
(
$t
p2
,
$acc
);
&xor
(
$
tp2
,
$acc
);
&mov
(
$t
mp
,
0x80808080
);
&and
(
$
acc
,
0x80808080
);
&mov
(
$
tmp
,
$acc
);
&and
(
$
tmp
,
$tp2
);
&mov
(
$
acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&lea
(
$tp4
,
&DWP
(
0
,
$tp2
,
$tp2
));
&sub
(
$acc
,
$tmp
);
&and
(
$tp4
,
0xfefefefe
);
&and
(
$acc
,
0x1b1b1b1b
);
&xor
(
$tp2
,
$s
[
$i
]);
# tp2^tp1
&xor
(
$
acc
,
$tp4
);
&mov
(
$t
p4
,
$acc
);
&xor
(
$
tp4
,
$acc
);
&mov
(
$t
mp
,
0x80808080
);
&and
(
$
acc
,
0x80808080
);
&mov
(
$
tmp
,
$acc
);
&and
(
$
tmp
,
$tp4
);
&mov
(
$
acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&lea
(
$tp8
,
&DWP
(
0
,
$tp4
,
$tp4
));
&sub
(
$acc
,
$tmp
);
...
...
@@ -1305,13 +1310,13 @@ sub dectransform()
&xor
(
$s
[
$i
],
$tp2
);
&xor
(
$tp2
,
$tp8
);
&rotl
(
$tp2
,
24
);
&xor
(
$s
[
$i
],
$tp4
);
&xor
(
$tp4
,
$tp8
);
&rotl
(
$tp
4
,
16
);
&rotl
(
$tp
2
,
24
);
&xor
(
$s
[
$i
],
$tp8
);
# ^= tp8^(tp4^tp1)^(tp2^tp1)
&rotl
(
$tp
8
,
8
);
&rotl
(
$tp
4
,
16
);
&xor
(
$s
[
$i
],
$tp2
);
# ^= ROTATE(tp8^tp2^tp1,24)
&rotl
(
$tp8
,
8
);
&xor
(
$s
[
$i
],
$tp4
);
# ^= ROTATE(tp8^tp4^tp1,16)
&mov
(
$s
[
0
],
$__s0
)
if
(
$i
==
2
);
#prefetch $s0
&mov
(
$s
[
1
],
$__s1
)
if
(
$i
==
3
);
#prefetch $s1
...
...
@@ -1389,85 +1394,87 @@ sub dectransform()
sub
sse_deccompact
()
{
&pshufw
("
mm1
","
mm0
",
0x0c
);
# 7, 6, 1, 0
&pshufw
("
mm5
","
mm4
",
0x09
);
# 13,12,11,10
&movd
("
eax
","
mm1
");
# 7, 6, 1, 0
&movd
("
ebx
","
mm5
");
# 13,12,11,10
&mov
(
$__key
,
$key
);
&pshufw
("
mm5
","
mm4
",
0x09
);
# 13,12,11,10
&movz
(
$acc
,
&LB
("
eax
"));
# 0
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 0
&movd
("
ebx
","
mm5
");
# 13,12,11,10
&movz
("
edx
",
&HB
("
eax
"));
# 1
&pshufw
("
mm2
","
mm0
",
0x06
);
# 3, 2, 5, 4
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 0
&movz
(
$key
,
&LB
("
ebx
"));
# 10
&movz
("
edx
",
&BP
(
-
128
,
$tbl
,"
edx
",
1
));
# 1
&shr
("
eax
",
16
);
# 7, 6
&shl
("
edx
",
8
);
# 1
&pshufw
("
mm2
","
mm0
",
0x06
);
# 3, 2, 5, 4
&movz
(
$acc
,
&LB
("
ebx
"));
# 10
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 10
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 10
&movz
(
$key
,
&HB
("
ebx
"));
# 11
&shl
(
$acc
,
16
);
# 10
&pshufw
("
mm6
","
mm4
",
0x03
);
# 9, 8,15,14
&or
("
ecx
",
$acc
);
# 10
&shr
("
eax
",
16
);
# 7, 6
&movz
(
$acc
,
&HB
("
ebx
"));
# 11
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 11
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 11
&movz
(
$key
,
&HB
("
eax
"));
# 7
&shl
(
$acc
,
24
);
# 11
&or
("
edx
",
$acc
);
# 11
&shr
("
ebx
",
16
);
# 13,12
&or
("
edx
",
$acc
);
# 11
&pshufw
("
mm6
","
mm4
",
0x03
);
# 9, 8,15,14
&movz
(
$acc
,
&HB
("
eax
"));
# 7
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 7
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 7
&movz
(
$key
,
&HB
("
ebx
"));
# 13
&shl
(
$acc
,
24
);
# 7
&or
("
ecx
",
$acc
);
# 7
&movz
(
$acc
,
&
HB
("
ebx
"));
# 13
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 13
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 13
&movz
(
$
key
,
&LB
("
eax
"));
# 6
&shl
(
$acc
,
8
);
# 13
&movd
("
eax
","
mm2
");
# 3, 2, 5, 4
&or
("
ecx
",
$acc
);
# 13
&movd
("
mm0
","
ecx
");
# t[0] collected
&movz
(
$acc
,
&LB
("
eax
"));
# 6
&movd
("
eax
","
mm2
");
# 3, 2, 5, 4
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 6
&shl
("
ecx
",
16
);
# 6
&movz
(
$acc
,
&LB
("
ebx
"));
# 12
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 6
&movz
(
$key
,
&LB
("
ebx
"));
# 12
&shl
(
$acc
,
16
);
# 6
&movd
("
ebx
","
mm6
");
# 9, 8,15,14
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 12
&movd
("
mm0
","
ecx
");
# t[0] collected
&movz
("
ecx
",
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 12
&movz
(
$key
,
&LB
("
eax
"));
# 4
&or
("
ecx
",
$acc
);
# 12
&movz
(
$acc
,
&
LB
("
eax
"));
# 4
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 4
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 4
&movz
(
$
key
,
&LB
("
ebx
"));
# 14
&or
("
edx
",
$acc
);
# 4
&movz
(
$acc
,
&
LB
("
ebx
"));
# 14
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 14
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 14
&movz
(
$
key
,
&HB
("
eax
"));
# 5
&shl
(
$acc
,
16
);
# 14
&shr
("
eax
",
16
);
# 3, 2
&or
("
edx
",
$acc
);
# 14
&movd
("
mm1
","
edx
");
# t[1] collected
&movz
(
$acc
,
&HB
("
eax
"));
# 5
&movz
("
edx
",
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 5
&shl
("
edx
",
8
);
# 5
&movz
(
$acc
,
&HB
("
ebx
"));
# 15
&shr
("
eax
",
16
);
# 3, 2
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 15
&shl
(
$acc
,
24
);
# 15
&or
("
edx
",
$acc
);
# 15
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 5
&movz
(
$key
,
&HB
("
ebx
"));
# 15
&shr
("
ebx
",
16
);
# 9, 8
&shl
(
$acc
,
8
);
# 5
&movd
("
mm1
","
edx
");
# t[1] collected
&movz
("
edx
",
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 15
&movz
(
$key
,
&HB
("
ebx
"));
# 9
&shl
("
edx
",
24
);
# 15
&and
("
ebx
",
0xff
);
# 8
&or
("
edx
",
$acc
);
# 15
&punpckldq
("
mm0
","
mm1
");
# t[0,1] collected
&movz
(
$acc
,
&
HB
("
ebx
"));
# 9
&movz
(
$
acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 9
&movz
(
$acc
,
&
BP
(
-
128
,
$tbl
,
$key
,
1
));
# 9
&movz
(
$
key
,
&LB
("
eax
"));
# 2
&shl
(
$acc
,
8
);
# 9
&or
("
ecx
",
$acc
);
# 9
&and
("
ebx
",
0xff
);
# 8
&movz
("
eax
",
&HB
("
eax
"));
# 3
&movz
("
ebx
",
&BP
(
-
128
,
$tbl
,"
ebx
",
1
));
# 8
&or
("
ecx
",
$acc
);
# 9
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$key
,
1
));
# 2
&or
("
edx
","
ebx
");
# 8
&movz
(
$acc
,
&LB
("
eax
"));
# 2
&movz
(
$acc
,
&BP
(
-
128
,
$tbl
,
$acc
,
1
));
# 2
&shl
(
$acc
,
16
);
# 2
&or
("
edx
",
$acc
);
# 2
&movd
("
mm4
","
edx
");
# t[2] collected
&movz
("
eax
",
&HB
("
eax
"));
# 3
&movz
("
eax
",
&BP
(
-
128
,
$tbl
,"
eax
",
1
));
# 3
&or
("
edx
",
$acc
);
# 2
&shl
("
eax
",
24
);
# 3
&or
("
ecx
","
eax
");
# 3
&mov
(
$key
,
$__key
);
&movd
("
mm4
","
edx
");
# t[2] collected
&movd
("
mm5
","
ecx
");
# t[3] collected
&punpckldq
("
mm4
","
mm5
");
# t[2,3] collected
...
...
@@ -2865,32 +2872,32 @@ sub deckey()
{
my
(
$i
,
$key
,
$tp1
,
$tp2
,
$tp4
,
$tp8
)
=
@_
;
my
$tmp
=
$tbl
;
&mov
(
$acc
,
$tp1
);
&and
(
$acc
,
0x80808080
);
&mov
(
$tmp
,
$acc
);
&shr
(
$tmp
,
7
);
&mov
(
$tmp
,
0x80808080
);
&and
(
$tmp
,
$tp1
);
&lea
(
$tp2
,
&DWP
(
0
,
$tp1
,
$tp1
));
&mov
(
$acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&sub
(
$acc
,
$tmp
);
&and
(
$tp2
,
0xfefefefe
);
&and
(
$acc
,
0x1b1b1b1b
);
&xor
(
$
acc
,
$tp2
);
&mov
(
$t
p2
,
$acc
);
&xor
(
$
tp2
,
$acc
);
&mov
(
$t
mp
,
0x80808080
);
&and
(
$acc
,
0x80808080
);
&mov
(
$tmp
,
$acc
);
&shr
(
$tmp
,
7
);
&and
(
$tmp
,
$tp2
);
&lea
(
$tp4
,
&DWP
(
0
,
$tp2
,
$tp2
));
&mov
(
$acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&sub
(
$acc
,
$tmp
);
&and
(
$tp4
,
0xfefefefe
);
&and
(
$acc
,
0x1b1b1b1b
);
&xor
(
$tp2
,
$tp1
);
# tp2^tp1
&xor
(
$
acc
,
$tp4
);
&mov
(
$t
p4
,
$acc
);
&xor
(
$
tp4
,
$acc
);
&mov
(
$t
mp
,
0x80808080
);
&and
(
$acc
,
0x80808080
);
&mov
(
$tmp
,
$acc
);
&shr
(
$tmp
,
7
);
&and
(
$tmp
,
$tp4
);
&lea
(
$tp8
,
&DWP
(
0
,
$tp4
,
$tp4
));
&mov
(
$acc
,
$tmp
);
&shr
(
$tmp
,
7
);
&xor
(
$tp4
,
$tp1
);
# tp4^tp1
&sub
(
$acc
,
$tmp
);
&and
(
$tp8
,
0xfefefefe
);
...
...
crypto/aes/asm/vpaes-x86.pl
浏览文件 @
89f1eb82
...
...
@@ -27,9 +27,9 @@
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
# Nehalem 27.9/40.4/18.1 10.3/12.0
# Atom 102./119./60.1 64.5/85.3(***)
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/81.0(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
...
...
@@ -40,8 +40,8 @@
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +32%/65% improvement on Core 2
# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
# pshufb, yet it's respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
...
...
@@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output
&movdqa
("
xmm1
","
xmm6
")
&movdqa
("
xmm2
",
&QWP
(
$k_ipt
,
$const
));
&pandn
("
xmm1
","
xmm0
");
&movdqu
("
xmm5
",
&QWP
(
0
,
$key
));
&psrld
("
xmm1
",
4
);
&pand
("
xmm0
","
xmm6
");
&movdqu
("
xmm5
",
&QWP
(
0
,
$key
));
&pshufb
("
xmm2
","
xmm0
");
&movdqa
("
xmm0
",
&QWP
(
$k_ipt
+
16
,
$const
));
&pshufb
("
xmm0
","
xmm1
");
&pxor
("
xmm2
","
xmm5
");
&p
xor
("
xmm0
","
xmm2
"
);
&p
srld
("
xmm1
",
4
);
&add
(
$key
,
16
);
&pshufb
("
xmm0
","
xmm1
");
&lea
(
$base
,
&DWP
(
$k_mc_backward
,
$const
));
&pxor
("
xmm0
","
xmm2
");
&jmp
(
&label
("
enc_entry
"));
&set_label
("
enc_loop
",
16
);
# middle of middle round
&movdqa
("
xmm4
",
&QWP
(
$k_sb1
,
$const
));
# 4 : sb1u
&pshufb
("
xmm4
","
xmm2
");
# 4 = sb1u
&pxor
("
xmm4
","
xmm5
");
# 4 = sb1u + k
&movdqa
("
xmm0
",
&QWP
(
$k_sb1
+
16
,
$const
));
# 0 : sb1t
&pshufb
("
xmm4
","
xmm2
");
# 4 = sb1u
&pshufb
("
xmm0
","
xmm3
");
# 0 = sb1t
&pxor
("
xmm
0
","
xmm4
");
# 0 = A
&pxor
("
xmm
4
","
xmm5
");
# 4 = sb1u + k
&movdqa
("
xmm5
",
&QWP
(
$k_sb2
,
$const
));
# 4 : sb2u
&p
shufb
("
xmm5
","
xmm2
");
# 4 = sb2u
&p
xor
("
xmm0
","
xmm4
");
# 0 = A
&movdqa
("
xmm1
",
&QWP
(
-
0x40
,
$base
,
$magic
));
# .Lk_mc_forward[]
&pshufb
("
xmm5
","
xmm2
");
# 4 = sb2u
&movdqa
("
xmm2
",
&QWP
(
$k_sb2
+
16
,
$const
));
# 2 : sb2t
&pshufb
("
xmm2
","
xmm3
");
# 2 = sb2t
&pxor
("
xmm2
","
xmm5
");
# 2 = 2A
&movdqa
("
xmm4
",
&QWP
(
0
,
$base
,
$magic
));
# .Lk_mc_backward[]
&pshufb
("
xmm2
","
xmm3
");
# 2 = sb2t
&movdqa
("
xmm3
","
xmm0
");
# 3 = A
&pxor
("
xmm2
","
xmm5
");
# 2 = 2A
&pshufb
("
xmm0
","
xmm1
");
# 0 = B
&add
(
$key
,
16
);
# next key
&pxor
("
xmm0
","
xmm2
");
# 0 = 2A+B
...
...
@@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output
&pxor
("
xmm3
","
xmm0
");
# 3 = 2A+B+D
&pshufb
("
xmm0
","
xmm1
");
# 0 = 2B+C
&and
(
$magic
,
0x30
);
# ... mod 4
&pxor
("
xmm0
","
xmm3
");
# 0 = 2A+3B+C+D
&sub
(
$round
,
1
);
# nr--
&pxor
("
xmm0
","
xmm3
");
# 0 = 2A+3B+C+D
&set_label
("
enc_entry
");
# top of round
&movdqa
("
xmm1
","
xmm6
");
# 1 : i
&movdqa
("
xmm5
",
&QWP
(
$k_inv
+
16
,
$const
));
# 2 : a/k
&pandn
("
xmm1
","
xmm0
");
# 1 = i<<4
&psrld
("
xmm1
",
4
);
# 1 = i
&pand
("
xmm0
","
xmm6
");
# 0 = k
&movdqa
("
xmm5
",
&QWP
(
$k_inv
+
16
,
$const
));
# 2 : a/k
&pshufb
("
xmm5
","
xmm0
");
# 2 = a/k
&pxor
("
xmm0
","
xmm1
");
# 0 = j
&movdqa
("
xmm3
","
xmm7
");
# 3 : 1/i
&pxor
("
xmm0
","
xmm1
");
# 0 = j
&pshufb
("
xmm3
","
xmm1
");
# 3 = 1/i
&pxor
("
xmm3
","
xmm5
");
# 3 = iak = 1/i + a/k
&movdqa
("
xmm4
","
xmm7
");
# 4 : 1/j
&pxor
("
xmm3
","
xmm5
");
# 3 = iak = 1/i + a/k
&pshufb
("
xmm4
","
xmm0
");
# 4 = 1/j
&pxor
("
xmm4
","
xmm5
");
# 4 = jak = 1/j + a/k
&movdqa
("
xmm2
","
xmm7
");
# 2 : 1/iak
&pxor
("
xmm4
","
xmm5
");
# 4 = jak = 1/j + a/k
&pshufb
("
xmm2
","
xmm3
");
# 2 = 1/iak
&pxor
("
xmm2
","
xmm0
");
# 2 = io
&movdqa
("
xmm3
","
xmm7
");
# 3 : 1/jak
&
movdqu
("
xmm5
",
&QWP
(
0
,
$key
));
&
pxor
("
xmm2
","
xmm0
");
# 2 = io
&pshufb
("
xmm3
","
xmm4
");
# 3 = 1/jak
&movdqu
("
xmm5
",
&QWP
(
0
,
$key
));
&pxor
("
xmm3
","
xmm1
");
# 3 = jo
&jnz
(
&label
("
enc_loop
"));
...
...
@@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Same API as encryption core.
##
&function_begin_B
("
_vpaes_decrypt_core
");
&mov
(
$round
,
&DWP
(
240
,
$key
));
&lea
(
$base
,
&DWP
(
$k_dsbd
,
$const
));
&mov
(
$round
,
&DWP
(
240
,
$key
));
&movdqa
("
xmm1
","
xmm6
");
&movdqa
("
xmm2
",
&QWP
(
$k_dipt
-
$k_dsbd
,
$base
));
&pandn
("
xmm1
","
xmm0
");
...
...
@@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Inverse mix columns
##
&movdqa
("
xmm4
",
&QWP
(
-
0x20
,
$base
));
# 4 : sb9u
&movdqa
("
xmm1
",
&QWP
(
-
0x10
,
$base
));
# 0 : sb9t
&pshufb
("
xmm4
","
xmm2
");
# 4 = sb9u
&pshufb
("
xmm1
","
xmm3
");
# 0 = sb9t
&pxor
("
xmm4
","
xmm0
");
&movdqa
("
xmm0
",
&QWP
(
-
0x10
,
$base
));
# 0 : sb9t
&pshufb
("
xmm0
","
xmm3
");
# 0 = sb9t
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&add
(
$key
,
16
);
# next round key
&pxor
("
xmm1
","
xmm4
");
# 0 = ch
&pshufb
("
xmm0
","
xmm5
");
# MC ch
&movdqa
("
xmm4
",
&QWP
(
0
,
$base
));
# 4 : sbdu
&pshufb
("
xmm1
","
xmm5
");
# MC ch
&pshufb
("
xmm4
","
xmm2
");
# 4 = sbdu
&pxor
("
xmm4
","
xmm0
");
# 4 = ch
&movdqa
("
xmm0
",
&QWP
(
0x10
,
$base
));
# 0 : sbdt
&pxor
("
xmm4
","
xmm1
");
# 4 = ch
&pshufb
("
xmm0
","
xmm3
");
# 0 = sbdt
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&sub
(
$round
,
1
);
# nr--
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&pshufb
("
xmm0
","
xmm5
");
# MC ch
&movdqa
("
xmm4
",
&QWP
(
0x20
,
$base
));
# 4 : sbbu
&pshufb
("
xmm0
","
xmm5
");
# MC ch
&movdqa
("
xmm1
",
&QWP
(
0x30
,
$base
));
# 0 : sbbt
&pshufb
("
xmm4
","
xmm2
");
# 4 = sbbu
&pshufb
("
xmm1
","
xmm3
");
# 0 = sbbt
&pxor
("
xmm4
","
xmm0
");
# 4 = ch
&movdqa
("
xmm0
",
&QWP
(
0x30
,
$base
));
# 0 : sbbt
&pshufb
("
xmm0
","
xmm3
");
# 0 = sbbt
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&pxor
("
xmm1
","
xmm4
");
# 0 = ch
&pshufb
("
xmm0
","
xmm5
");
# MC ch
&movdqa
("
xmm4
",
&QWP
(
0x40
,
$base
));
# 4 : sbeu
&pshufb
("
xmm4
","
xmm2
");
# 4 = sbeu
&pxor
("
xmm4
","
xmm0
");
# 4 = ch
&pshufb
("
xmm1
","
xmm5
");
# MC ch
&movdqa
("
xmm0
",
&QWP
(
0x50
,
$base
));
# 0 : sbet
&pshufb
("
xmm4
","
xmm2
");
# 4 = sbeu
&pshufb
("
xmm0
","
xmm3
");
# 0 = sbet
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&palignr
("
xmm5
","
xmm5
",
12
);
&pxor
("
xmm4
","
xmm1
");
# 4 = ch
&pxor
("
xmm0
","
xmm4
");
# 0 = ch
&set_label
("
dec_entry
");
# top of round
&movdqa
("
xmm1
","
xmm6
");
# 1 : i
&pandn
("
xmm1
","
xmm0
");
# 1 = i<<4
&movdqa
("
xmm2
",
&QWP
(
$k_inv
+
16
,
$const
));
# 2 : a/k
&psrld
("
xmm1
",
4
);
# 1 = i
&pand
("
xmm0
","
xmm6
");
# 0 = k
&movdqa
("
xmm2
",
&QWP
(
$k_inv
+
16
,
$const
));
# 2 : a/k
&pshufb
("
xmm2
","
xmm0
");
# 2 = a/k
&pxor
("
xmm0
","
xmm1
");
# 0 = j
&movdqa
("
xmm3
","
xmm7
");
# 3 : 1/i
&pxor
("
xmm0
","
xmm1
");
# 0 = j
&pshufb
("
xmm3
","
xmm1
");
# 3 = 1/i
&pxor
("
xmm3
","
xmm2
");
# 3 = iak = 1/i + a/k
&movdqa
("
xmm4
","
xmm7
");
# 4 : 1/j
&pxor
("
xmm3
","
xmm2
");
# 3 = iak = 1/i + a/k
&pshufb
("
xmm4
","
xmm0
");
# 4 = 1/j
&pxor
("
xmm4
","
xmm2
");
# 4 = jak = 1/j + a/k
&movdqa
("
xmm2
","
xmm7
");
# 2 : 1/iak
&pshufb
("
xmm2
","
xmm3
");
# 2 = 1/iak
&pxor
("
xmm2
","
xmm0
");
# 2 = io
&movdqa
("
xmm3
","
xmm7
");
# 3 : 1/jak
&pxor
("
xmm2
","
xmm0
");
# 2 = io
&pshufb
("
xmm3
","
xmm4
");
# 3 = 1/jak
&pxor
("
xmm3
","
xmm1
");
# 3 = jo
&movdqu
("
xmm0
",
&QWP
(
0
,
$key
));
&pxor
("
xmm3
","
xmm1
");
# 3 = jo
&jnz
(
&label
("
dec_loop
"));
# middle of last round
...
...
@@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output
## %xmm0: b+c+d b+c b a
##
&function_begin_B
("
_vpaes_schedule_192_smear
");
&pshufd
("
xmm0
","
xmm6
",
0x80
);
# d c 0 0 -> c 0 0 0
&pxor
("
xmm6
","
xmm0
");
# -> c+d c 0 0
&pshufd
("
xmm1
","
xmm6
",
0x80
);
# d c 0 0 -> c 0 0 0
&pshufd
("
xmm0
","
xmm7
",
0xFE
);
# b a _ _ -> b b b a
&pxor
("
xmm6
","
xmm1
");
# -> c+d c 0 0
&pxor
("
xmm1
","
xmm1
");
&pxor
("
xmm6
","
xmm0
");
# -> b+c+d b+c b a
&movdqa
("
xmm0
","
xmm6
");
&pxor
("
xmm1
","
xmm1
");
&movhlps
("
xmm6
","
xmm1
");
# clobber low side with zeros
&ret
();
&function_end_B
("
_vpaes_schedule_192_smear
");
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录