Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
btwise
openssl
提交
7c9e81be
O
openssl
项目概览
btwise
/
openssl
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
7c9e81be
编写于
2月 14, 2013
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[aesni-]sha1-x86_64.pl: code refresh.
上级
c4558efb
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
193 addition
and
145 deletion
+193
-145
crypto/aes/asm/aesni-sha1-x86_64.pl
crypto/aes/asm/aesni-sha1-x86_64.pl
+90
-65
crypto/sha/asm/sha1-x86_64.pl
crypto/sha/asm/sha1-x86_64.pl
+103
-80
未找到文件。
crypto/aes/asm/aesni-sha1-x86_64.pl
浏览文件 @
7c9e81be
...
...
@@ -21,22 +21,22 @@
# subroutine:
#
# AES-128-CBC +SHA1 stitch gain
# Westmere 3.77[+5.
6] 9.37 6.65
+41%
# Sandy Bridge 5.05[+5.
2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60
%)
# Ivy Bridge 5.05[+4.
7] 9.75 5.59
+74%
# Bulldozer 5.77[+6.
1] 11.87 6.47 +83
%
# Westmere 3.77[+5.
5] 9.26 6.58
+41%
# Sandy Bridge 5.05[+5.
0(6.2)] 10.06(11.21) 6.09(7.05) +65%(+59
%)
# Ivy Bridge 5.05[+4.
6] 9.65 5.54
+74%
# Bulldozer 5.77[+6.
0] 11.72 6.37 +84
%
#
# AES-192-CBC
# Westmere 4.51 10.
11 6.97 +45
%
# Sandy Bridge 6.05 11.
25(12.35) 6.34(7.27) +77
%(+70%)
# Ivy Bridge 6.05 10.
75 6.07 +77
%
# Bulldozer 6.89 12.
99 7.02 +85
%
# Westmere 4.51 10.
00 6.87 +46
%
# Sandy Bridge 6.05 11.
06(12.21) 6.11(7.20) +81
%(+70%)
# Ivy Bridge 6.05 10.
65 6.07 +75
%
# Bulldozer 6.89 12.
84 6.96 +84
%
#
# AES-256-CBC
# Westmere 5.25 10.
85 7.25 +50
%
# Sandy Bridge 7.05 12.
25(13.35) 7.06(7.70) +74%(+73
%)
# Ivy Bridge 7.05 11.
75 7.12 +65
%
# Bulldozer 8.00 1
4.10 8.24 +71
%
# Westmere 5.25 10.
74 7.19 +49
%
# Sandy Bridge 7.05 12.
06(13.21) 7.12(7.68) +69%(+72
%)
# Ivy Bridge 7.05 11.
65 7.12 +64
%
# Bulldozer 8.00 1
3.95 8.25 +69
%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
# background information. Above numbers in parentheses are SSSE3
...
...
@@ -120,7 +120,7 @@ my @X=map("%xmm$_",(4..7,0..3));
my
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
my
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
)
=
("
%eax
","
%ebx
","
%ecx
","
%edx
","
%ebp
");
# size optimization
my
@T
=
("
%esi
","
%edi
");
my
$j
=
0
;
my
$jj
=
0
;
my
$r
=
0
;
my
$sn
=
0
;
my
$j
=
0
;
my
$jj
=
0
;
my
$r
=
0
;
my
$sn
=
0
;
my
$rx
=
0
;
my
$K_XX_XX
=
"
%r11
";
my
(
$iv
,
$in
,
$rndkey0
)
=
map
("
%xmm
$_
",(
11
..
13
));
my
@rndkey
=
("
%xmm14
","
%xmm15
");
...
...
@@ -188,6 +188,9 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
mov $C,@T[1]
xor $D,@T[1]
and @T[1],@T[0]
movdqa 64($K_XX_XX),@X[2] # pbswap mask
movdqa 0($K_XX_XX),@Tx[1] # K_00_19
...
...
@@ -473,21 +476,27 @@ sub Xtail_ssse3()
foreach
(
@insns
)
{
eval
;
}
}
sub
body_00_19
()
{
sub
body_00_19
()
{
# ((c^d)&b)^d
# on start @T[0]=(c^d)&b
return
&body_20_39
()
if
(
$rx
==
19
);
$rx
++
;
use
integer
;
my
(
$k
,
$n
);
my
@r
=
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&add ($e,eval(4*($j&15))."(%rsp)");
',
# X[]+K xfer
'
&xor ($c,$d);
',
'
&mov (@T[1],$a);
',
# $b in next round
'
&$_rol ($a,5);
',
'
&and (@T[0],$c);
',
# ($b&($c^$d))
'
&xor ($c,$d);
',
# restore $c
'
&xor (@T[0],$d);
',
'
&add ($e,$a);
',
'
&$_ror ($b,$j?7:2);
',
# $b>>>2
'
&add ($e,@T[0]);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&xor (@T[0],$d);
',
'
&mov (@T[1],$a);
',
# $b for next round
'
&add ($e,eval(4*($j&15))."(%rsp)");
',
# X[]+K xfer
'
&xor ($b,$c);
',
# $c^$d for next round
'
&$_rol ($a,5);
',
'
&add ($e,@T[0]);
',
'
&and (@T[1],$b);
',
# ($b&($c^$d)) for next round
'
&xor ($b,$c);
',
# restore $b
'
&add ($e,$a);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
$n
=
scalar
(
@r
);
$k
=
((
$jj
+
1
)
*
12
/20)*20*$n/
12
;
# 12 aesencs per these 20 rounds
...
...
@@ -496,48 +505,60 @@ sub body_00_19 () {
return
@r
;
}
sub
body_20_39
()
{
sub
body_20_39
()
{
# b^d^c
# on entry @T[0]=b^d
return
&body_40_59
()
if
(
$rx
==
39
);
$rx
++
;
use
integer
;
my
(
$k
,
$n
);
my
@r
=
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&add ($e,eval(4*($j++&15))."(%rsp)");
',
# X[]+K xfer
'
&xor (@T[0],$d);
',
# ($b^$d)
'
&mov (@T[1],$a);
',
# $b in next round
'
&add ($e,eval(4*($j&15))."(%rsp)");
',
# X[]+K xfer
'
&xor (@T[0],$d) if($j==19);
'
.
'
&xor (@T[0],$c) if($j> 19);
',
# ($b^$d^$c)
'
&mov (@T[1],$a);
',
# $b for next round
'
&$_rol ($a,5);
',
'
&xor (@T[0],$c);
',
# ($b^$d^$c)
'
&add ($e,$a);
',
'
&add ($e,@T[0]);
',
'
&xor (@T[1],$c) if ($j< 79);
',
# $b^$d for next round
'
&$_ror ($b,7);
',
# $b>>>2
'
&add ($e,
@T[0]);
'
.
'
unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&add ($e,
$a);
'
.
'
$j++;
unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
$n
=
scalar
(
@r
);
$k
=
((
$jj
+
1
)
*
8
/20)*20*$n/
8
;
# 8 aesencs per these 20 rounds
@r
[
$k%$n
]
.=
'
&$aesenc();
'
if
(
$jj
==
$k
/
$n
);
@r
[
$k%$n
]
.=
'
&$aesenc();
'
if
(
$jj
==
$k
/
$n
&&
$rx
!=
20
);
$jj
++
;
return
@r
;
}
sub
body_40_59
()
{
sub
body_40_59
()
{
# ((b^c)&(c^d))^c
# on entry @T[0]=(b^c), (c^=d)
$rx
++
;
use
integer
;
my
(
$k
,
$n
);
my
@r
=
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&mov (@T[1],$c);
',
'
&xor ($c,$d);
',
'
&add ($e,eval(4*($j++&15))."(%rsp)");
',
# X[]+K xfer
'
&and (@T[1],$d);
',
'
&and (@T[0],$c);
',
# ($b&($c^$d))
'
&add ($e,eval(4*($j&15))."(%rsp)");
',
# X[]+K xfer
'
&and (@T[0],$c) if ($j>=40);
',
# (b^c)&(c^d)
'
&xor ($c,$d) if ($j>=40);
',
# restore $c
'
&$_ror ($b,7);
',
# $b>>>2
'
&add ($e,@T[1]);
',
'
&mov (@T[1],$a);
',
# $b in next round
'
&mov (@T[1],$a);
',
# $b for next round
'
&xor (@T[0],$c);
',
'
&$_rol ($a,5);
',
'
&add ($e,@T[0]);
',
'
&xor ($c,$d);
',
# restore $c
'
&add ($e,$a);
'
.
'
unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&xor (@T[1],$c) if ($j==59);
'
.
'
&xor (@T[1],$b) if ($j< 59);
',
# b^c for next round
'
&xor ($b,$c) if ($j< 59);
',
# c^d for next round
'
&add ($e,$a);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
$n
=
scalar
(
@r
);
$k
=
((
$jj
+
1
)
*
12
/20)*20*$n/
12
;
# 12 aesencs per these 20 rounds
@r
[
$k%$n
]
.=
'
&$aesenc();
'
if
(
$jj
==
$k
/
$n
);
@r
[
$k%$n
]
.=
'
&$aesenc();
'
if
(
$jj
==
$k
/
$n
&&
$rx
!=
40
);
$jj
++
;
return
@r
;
}
...
...
@@ -583,8 +604,11 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $C,@T[1]
mov $D,12($ctx)
xor $D,@T[1]
mov $E,16($ctx)
and @T[1],@T[0]
jmp .Loop_ssse3
.align 16
...
...
@@ -639,7 +663,7 @@ $code.=<<___;
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
$j
=
$jj
=
$r
=
$sn
=
0
;
$j
=
$jj
=
$r
=
$sn
=
$rx
=
0
;
if
(
$avx
)
{
my
(
$in0
,
$out
,
$len
,
$key
,
$ivp
,
$ctx
,
$inp
)
=
("
%rdi
","
%rsi
","
%rdx
","
%rcx
","
%r8
","
%r9
","
%r10
");
...
...
@@ -647,6 +671,7 @@ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"
my
$Xi
=
4
;
my
@X
=
map
("
%xmm
$_
",(
4
..
7
,
0
..
3
));
my
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
my
$Kx
=
$rndkey0
;
my
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
)
=
("
%eax
","
%ebx
","
%ecx
","
%edx
","
%ebp
");
# size optimization
my
@T
=
("
%esi
","
%edi
");
...
...
@@ -708,9 +733,12 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
mov $C,@T[1]
xor $D,@T[1]
and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
vmovdqa 0($K_XX_XX),
@Tx[1]
# K_00_19
vmovdqa 0($K_XX_XX),
$Kx
# K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
...
...
@@ -720,13 +748,13 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
vpaddd
@Tx[1]
,@X[-4&7],@X[0] # add K_00_19
vpaddd
@Tx[1]
,@X[-3&7],@X[1]
vpaddd
@Tx[1]
,@X[-2&7],@X[2]
vpaddd
$Kx
,@X[-4&7],@X[0] # add K_00_19
vpaddd
$Kx
,@X[-3&7],@X[1]
vpaddd
$Kx
,@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
vmovups -112($key),$rndkey
0
# $key[0]
vmovups -112($key),$rndkey
[1]
# $key[0]
vmovups 16-112($key),$rndkey[0] # forward reference
jmp .Loop_avx
___
...
...
@@ -737,7 +765,7 @@ my $aesenc=sub {
if
(
$k
==
0
)
{
$code
.=
<<___;
vmovups `16*$n`($in0),$in # load input
vxorps $rndkey
0
,$in,$in
vxorps $rndkey
[1]
,$in,$in
___
$code
.=<<
___
if
(
$n
);
vmovups
$iv
,`
16*(
$n
-1)
`(
$out
,
$in0
)
# write output
...
...
@@ -763,6 +791,7 @@ ___
vmovups `32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
vaesenclast $rndkey[0],$iv,$iv
vmovups -112($key),$rndkey[0]
vmovups 16-112($key),$rndkey[1] # forward reference
___
}
else
{
...
...
@@ -786,10 +815,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
]
,
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpsrldq
(
@Tx
[
0
],
@X
[
-
1
&
7
],
4
);
# "X[-3]", 3 dwords
&vpsrldq
(
@Tx
[
0
],
@X
[
-
1
&
7
],
4
);
# "X[-3]", 3 dwords
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpxor
(
@X
[
0
],
@X
[
0
],
@X
[
-
4
&
7
]);
# "X[0]"^="X[-16]"
...
...
@@ -839,7 +868,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
&vpxor
(
@X
[
0
],
@X
[
0
],
@Tx
[
2
]);
# "X[0]"^=("X[0]">>96)<<<2
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vmovdqa
(
@Tx
[
2
],
eval
(
16
*
((
$Xi
)
/
5
))
.
"
(
$K_XX_XX
)
"
);
# K_XX_XX
&vmovdqa
(
$Kx
,
eval
(
16
*
((
$Xi
)
/
5
))
.
"
(
$K_XX_XX
)
")
if
(
$Xi
%
5
==
0
);
# K_XX_XX
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
...
...
@@ -847,7 +876,6 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
foreach
(
@insns
)
{
eval
;
}
# remaining instructions [if any]
$Xi
++
;
push
(
@X
,
shift
(
@X
));
# "rotate" X[]
push
(
@Tx
,
shift
(
@Tx
));
}
sub
Xupdate_avx_32_79
()
...
...
@@ -866,12 +894,8 @@ sub Xupdate_avx_32_79()
&vpxor
(
@X
[
0
],
@X
[
0
],
@X
[
-
7
&
7
]);
# "X[0]"^="X[-28]"
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
))
if
(
@insns
[
0
]
!~
/&ro[rl]/
);
if
(
$Xi
%
5
)
{
&vmovdqa
(
@Tx
[
2
],
@Tx
[
1
]);
# "perpetuate" K_XX_XX...
}
else
{
# ... or load next one
&vmovdqa
(
@Tx
[
2
],
eval
(
16
*
(
$Xi
/
5
))
.
"
(
$K_XX_XX
)
");
}
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
],
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
&vmovdqa
(
$Kx
,
eval
(
16
*
(
$Xi
/
5
))
.
"
(
$K_XX_XX
)
")
if
(
$Xi
%
5
==
0
);
eval
(
shift
(
@insns
));
# ror
eval
(
shift
(
@insns
));
...
...
@@ -901,7 +925,6 @@ sub Xupdate_avx_32_79()
&vpor
(
@X
[
0
],
@X
[
0
],
@Tx
[
0
]);
# "X[0]"<<<=2
eval
(
shift
(
@insns
));
# body_20_39
eval
(
shift
(
@insns
));
&vmovdqa
(
@Tx
[
1
],
@X
[
0
])
if
(
$Xi
<
19
);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
# rol
eval
(
shift
(
@insns
));
...
...
@@ -912,7 +935,6 @@ sub Xupdate_avx_32_79()
foreach
(
@insns
)
{
eval
;
}
# remaining instructions
$Xi
++
;
push
(
@X
,
shift
(
@X
));
# "rotate" X[]
push
(
@Tx
,
shift
(
@Tx
));
}
sub
Xuplast_avx_80
()
...
...
@@ -922,13 +944,13 @@ sub Xuplast_avx_80()
my
(
$a
,
$b
,
$c
,
$d
,
$e
);
eval
(
shift
(
@insns
));
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
]
,
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&movdqa
(
eval
(
16
*
((
$Xi
-
1
)
&
3
))
.
"
(%rsp)
",
@Tx
[
1
]);
# X[]+K xfer IALU
&
v
movdqa
(
eval
(
16
*
((
$Xi
-
1
)
&
3
))
.
"
(%rsp)
",
@Tx
[
1
]);
# X[]+K xfer IALU
foreach
(
@insns
)
{
eval
;
}
# remaining instructions
...
...
@@ -938,7 +960,7 @@ sub Xuplast_avx_80()
unshift
(
@Tx
,
pop
(
@Tx
));
&vmovdqa
(
@X
[
2
],"
64(
$K_XX_XX
)
");
# pbswap mask
&vmovdqa
(
@Tx
[
1
]
,"
0(
$K_XX_XX
)
");
# K_00_19
&vmovdqa
(
$Kx
,"
0(
$K_XX_XX
)
");
# K_00_19
&vmovdqu
(
@X
[
-
4
&
7
],"
0(
$inp
)
");
# load input
&vmovdqu
(
@X
[
-
3
&
7
],"
16(
$inp
)
");
&vmovdqu
(
@X
[
-
2
&
7
],"
32(
$inp
)
");
...
...
@@ -960,7 +982,7 @@ sub Xloop_avx()
&vpshufb
(
@X
[(
$Xi
-
3
)
&
7
],
@X
[(
$Xi
-
3
)
&
7
],
@X
[
2
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpaddd
(
@X
[
$Xi
&
7
],
@X
[(
$Xi
-
4
)
&
7
],
@Tx
[
1
]
);
&vpaddd
(
@X
[
$Xi
&
7
],
@X
[(
$Xi
-
4
)
&
7
],
$Kx
);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
...
...
@@ -1024,8 +1046,11 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $C,@T[1]
mov $D,12($ctx)
xor $D,@T[1]
mov $E,16($ctx)
and @T[1],@T[0]
jmp .Loop_avx
.align 16
...
...
crypto/sha/asm/sha1-x86_64.pl
浏览文件 @
7c9e81be
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@
fy.chalmers.se
> for the OpenSSL
# Written by Andy Polyakov <appro@
openssl.org
> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
...
...
@@ -55,14 +55,14 @@
#
# x86_64 SSSE3 AVX
# P4 9.8 -
# Opteron 6.6 -
# Core2 6.7
6.2/+8%
-
#
Atom 11.0 9.5/+15
% -
#
Westmere 7.1 5.5/+29% -
#
Sandy Bridge 7.9 6.2/+28% 5.1/+54
%
#
Ivy Bridge 6.4 4.7/+35% 4.6/+37
%
#
Bulldozer 10.9 6.0/+82
%
#
VIA Nano 10.2 7.4/+38
%
# Opteron 6.6
5
-
# Core2 6.7
0 6.05/+11%
-
#
Westmere 7.08 5.49/+29
% -
#
Sandy Bridge 7.93 6.16/+28% 4.99/+59%
#
Ivy Bridge 6.30 4.63/+36% 4.60/+37
%
#
Bulldozer 10.9 5.95/+82
%
#
VIA Nano 10.2 7.46/+37
%
#
Atom 11.0 9.61/+14
%
$flavour
=
shift
;
$output
=
shift
;
...
...
@@ -298,9 +298,11 @@ ___
my
$Xi
=
4
;
my
@X
=
map
("
%xmm
$_
",(
4
..
7
,
0
..
3
));
my
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
my
$Kx
=
"
%xmm11
";
my
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
)
=
("
%eax
","
%ebx
","
%ecx
","
%edx
","
%ebp
");
# size optimization
my
@T
=
("
%esi
","
%edi
");
my
$j
=
0
;
my
$rx
=
0
;
my
$K_XX_XX
=
"
%r11
";
my
$_rol
=
sub
{
&rol
(
@
_
)
};
...
...
@@ -314,7 +316,7 @@ _ssse3_shortcut:
push %rbx
push %rbp
push %r12
lea `-64-($win64?
5
*16:0)`(%rsp),%rsp
lea `-64-($win64?
6
*16:0)`(%rsp),%rsp
___
$code
.=<<
___
if
(
$win64
);
movaps
%xmm6
,
64
+
0
(
%rsp
)
...
...
@@ -322,6 +324,7 @@ $code.=<<___ if ($win64);
movaps
%xmm8
,
64
+
32
(
%rsp
)
movaps
%xmm9
,
64
+
48
(
%rsp
)
movaps
%xmm10
,
64
+
64
(
%rsp
)
movaps
%xmm11
,
64
+
80
(
%rsp
)
.
Lprologue_ssse3:
___
$code
.=
<<___;
...
...
@@ -339,6 +342,9 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
mov $C,@T[1]
xor $D,@T[1]
and @T[1],@T[0]
movdqa 64($K_XX_XX),@X[2] # pbswap mask
movdqa 0($K_XX_XX),@Tx[1] # K_00_19
...
...
@@ -586,50 +592,66 @@ sub Xtail_ssse3()
foreach
(
@insns
)
{
eval
;
}
}
sub
body_00_19
()
{
sub
body_00_19
()
{
# ((c^d)&b)^d
# on start @T[0]=(c^d)&b
return
&body_20_39
()
if
(
$rx
==
19
);
$rx
++
;
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&add ($e,eval(4*($j&15))."(%rsp)");
',
# X[]+K xfer
'
&xor ($c,$d);
',
'
&mov (@T[1],$a);
',
# $b in next round
'
&$_rol ($a,5);
',
'
&and (@T[0],$c);
',
# ($b&($c^$d))
'
&xor ($c,$d);
',
# restore $c
'
&xor (@T[0],$d);
',
'
&add ($e,$a);
',
'
&$_ror ($b,$j?7:2);
',
# $b>>>2
'
&add ($e,@T[0]);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&$_ror ($b,$j?7:2)
',
# $b>>>2
'
&xor (@T[0],$d)
',
'
&mov (@T[1],$a)
',
# $b for next round
'
&add ($e,eval(4*($j&15))."(%rsp)")
',
# X[]+K xfer
'
&xor ($b,$c)
',
# $c^$d for next round
'
&$_rol ($a,5)
',
'
&add ($e,@T[0])
',
'
&and (@T[1],$b)
',
# ($b&($c^$d)) for next round
'
&xor ($b,$c)
',
# restore $b
'
&add ($e,$a);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
}
sub
body_20_39
()
{
sub
body_20_39
()
{
# b^d^c
# on entry @T[0]=b^d
return
&body_40_59
()
if
(
$rx
==
39
);
$rx
++
;
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&add ($e,eval(4*($j++&15))."(%rsp)");
',
# X[]+K xfer
'
&xor (@T[0],$d);
',
# ($b^$d)
'
&mov (@T[1],$a);
',
# $b in next round
'
&$_rol ($a,5);
',
'
&xor (@T[0],$c);
',
# ($b^$d^$c)
'
&add ($e,$a);
',
'
&$_ror ($b,7);
',
# $b>>>2
'
&add ($e,@T[0]);
'
.
'
unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&add ($e,eval(4*($j&15))."(%rsp)")
',
# X[]+K xfer
'
&xor (@T[0],$d) if($j==19);
'
.
'
&xor (@T[0],$c) if($j> 19)
',
# ($b^$d^$c)
'
&mov (@T[1],$a)
',
# $b for next round
'
&$_rol ($a,5)
',
'
&add ($e,@T[0])
',
'
&xor (@T[1],$c) if ($j< 79)
',
# $b^$d for next round
'
&$_ror ($b,7)
',
# $b>>>2
'
&add ($e,$a);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
}
sub
body_40_59
()
{
sub
body_40_59
()
{
# ((b^c)&(c^d))^c
# on entry @T[0]=(b^c), (c^=d)
$rx
++
;
(
'
($a,$b,$c,$d,$e)=@V;
'
.
'
&xor (@T[0],$c);
',
'
&xor (@T[1],$d);
',
'
&add ($e,eval(4*($j++&15))."(%rsp)");
',
# X[]+K xfer
'
&and (@T[0],$T[1]);
',
'
&$_ror ($b,7);
',
# $b>>>2
'
&xor (@T[0],$c);
',
'
&mov (@T[1],$a);
',
# $b in next round
'
&$_rol ($a,5);
',
'
&add ($e,@T[0]);
',
'
&mov (@T[0],$b);
',
# copy of $c in next round
'
&add ($e,$a);
'
.
'
unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
'
&add ($e,eval(4*($j&15))."(%rsp)")
',
# X[]+K xfer
'
&and (@T[0],$c) if ($j>=40)
',
# (b^c)&(c^d)
'
&xor ($c,$d) if ($j>=40)
',
# restore $c
'
&$_ror ($b,7)
',
# $b>>>2
'
&mov (@T[1],$a)
',
# $b for next round
'
&xor (@T[0],$c)
',
'
&$_rol ($a,5)
',
'
&add ($e,@T[0])
',
'
&xor (@T[1],$c) if ($j==59);
'
.
'
&xor (@T[1],$b) if ($j< 59)
',
# b^c for next round
'
&xor ($b,$c) if ($j< 59)
',
# c^d for next round
'
&add ($e,$a);
'
.
'
$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));
'
);
}
$code
.=
<<___;
...
...
@@ -646,7 +668,6 @@ ___
&Xupdate_ssse3_32_79
(
\
&body_20_39
);
&Xupdate_ssse3_32_79
(
\
&body_20_39
);
&Xupdate_ssse3_32_79
(
\
&body_20_39
);
&mov
(
@T
[
1
],
@V
[
2
]);
# copy of $c in next round
&Xupdate_ssse3_32_79
(
\
&body_40_59
);
&Xupdate_ssse3_32_79
(
\
&body_40_59
);
&Xupdate_ssse3_32_79
(
\
&body_40_59
);
...
...
@@ -671,8 +692,11 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $C,@T[1]
mov $D,12($ctx)
xor $D,@T[1]
mov $E,16($ctx)
and @T[1],@T[0]
jmp .Loop_ssse3
.align 16
...
...
@@ -702,9 +726,10 @@ $code.=<<___ if ($win64);
movaps
64
+
32
(
%rsp
),
%xmm8
movaps
64
+
48
(
%rsp
),
%xmm9
movaps
64
+
64
(
%rsp
),
%xmm10
movaps
64
+
80
(
%rsp
),
%xmm11
___
$code
.=
<<___;
lea `64+($win64?
5
*16:0)`(%rsp),%rsi
lea `64+($win64?
6
*16:0)`(%rsp),%rsi
mov 0(%rsi),%r12
mov 8(%rsi),%rbp
mov 16(%rsi),%rbx
...
...
@@ -715,13 +740,13 @@ $code.=<<___;
___
if
(
$avx
)
{
my
$Xi
=
4
;
my
@X
=
map
("
%xmm
$_
",(
4
..
7
,
0
..
3
));
my
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
my
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
)
=
("
%eax
","
%ebx
","
%ecx
","
%edx
","
%ebp
");
# size optimization
my
@T
=
("
%esi
","
%edi
")
;
my
$j
=
0
;
my
$
K_XX_XX
=
"
%r11
";
$Xi
=
4
;
# reset variables
@X
=
map
("
%xmm
$_
",(
4
..
7
,
0
..
3
));
@Tx
=
map
("
%xmm
$_
",(
8
..
10
));
$j
=
0
;
$rx
=
0
;
my
$
done_avx_label
=
"
.Ldone_avx
";
my
$_rol
=
sub
{
&shld
(
@_
[
0
],
@
_
)
};
my
$_ror
=
sub
{
&shrd
(
@_
[
0
],
@
_
)
};
...
...
@@ -734,7 +759,7 @@ _avx_shortcut:
push %rbx
push %rbp
push %r12
lea `-64-($win64?
5
*16:0)`(%rsp),%rsp
lea `-64-($win64?
6
*16:0)`(%rsp),%rsp
___
$code
.=<<
___
if
(
$win64
);
movaps
%xmm6
,
64
+
0
(
%rsp
)
...
...
@@ -742,6 +767,7 @@ $code.=<<___ if ($win64);
movaps
%xmm8
,
64
+
32
(
%rsp
)
movaps
%xmm9
,
64
+
48
(
%rsp
)
movaps
%xmm10
,
64
+
64
(
%rsp
)
movaps
%xmm11
,
64
+
80
(
%rsp
)
.
Lprologue_avx:
___
$code
.=
<<___;
...
...
@@ -760,9 +786,12 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
mov $C,@T[1]
xor $D,@T[1]
and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
vmovdqa 0($K_XX_XX),
@Tx[1]
# K_00_19
vmovdqa 0($K_XX_XX),
$Kx
# K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
...
...
@@ -772,9 +801,9 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
vpaddd
@Tx[1]
,@X[-4&7],@X[0] # add K_00_19
vpaddd
@Tx[1]
,@X[-3&7],@X[1]
vpaddd
@Tx[1]
,@X[-2&7],@X[2]
vpaddd
$Kx
,@X[-4&7],@X[0] # add K_00_19
vpaddd
$Kx
,@X[-3&7],@X[1]
vpaddd
$Kx
,@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
...
...
@@ -793,10 +822,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
]
,
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpsrldq
(
@Tx
[
0
],
@X
[
-
1
&
7
],
4
);
# "X[-3]", 3 dwords
&vpsrldq
(
@Tx
[
0
],
@X
[
-
1
&
7
],
4
);
# "X[-3]", 3 dwords
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpxor
(
@X
[
0
],
@X
[
0
],
@X
[
-
4
&
7
]);
# "X[0]"^="X[-16]"
...
...
@@ -846,7 +875,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
&vpxor
(
@X
[
0
],
@X
[
0
],
@Tx
[
2
]);
# "X[0]"^=("X[0]">>96)<<<2
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vmovdqa
(
@Tx
[
2
],
eval
(
16
*
((
$Xi
)
/
5
))
.
"
(
$K_XX_XX
)
"
);
# K_XX_XX
&vmovdqa
(
$Kx
,
eval
(
16
*
((
$Xi
)
/
5
))
.
"
(
$K_XX_XX
)
")
if
(
$Xi
%
5
==
0
);
# K_XX_XX
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
...
...
@@ -854,7 +883,6 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
foreach
(
@insns
)
{
eval
;
}
# remaining instructions [if any]
$Xi
++
;
push
(
@X
,
shift
(
@X
));
# "rotate" X[]
push
(
@Tx
,
shift
(
@Tx
));
}
sub
Xupdate_avx_32_79
()
...
...
@@ -873,12 +901,8 @@ sub Xupdate_avx_32_79()
&vpxor
(
@X
[
0
],
@X
[
0
],
@X
[
-
7
&
7
]);
# "X[0]"^="X[-28]"
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
))
if
(
@insns
[
0
]
!~
/&ro[rl]/
);
if
(
$Xi
%
5
)
{
&vmovdqa
(
@Tx
[
2
],
@Tx
[
1
]);
# "perpetuate" K_XX_XX...
}
else
{
# ... or load next one
&vmovdqa
(
@Tx
[
2
],
eval
(
16
*
(
$Xi
/
5
))
.
"
(
$K_XX_XX
)
");
}
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
],
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
&vmovdqa
(
$Kx
,
eval
(
16
*
(
$Xi
/
5
))
.
"
(
$K_XX_XX
)
")
if
(
$Xi
%
5
==
0
);
eval
(
shift
(
@insns
));
# ror
eval
(
shift
(
@insns
));
...
...
@@ -908,7 +932,6 @@ sub Xupdate_avx_32_79()
&vpor
(
@X
[
0
],
@X
[
0
],
@Tx
[
0
]);
# "X[0]"<<<=2
eval
(
shift
(
@insns
));
# body_20_39
eval
(
shift
(
@insns
));
&vmovdqa
(
@Tx
[
1
],
@X
[
0
])
if
(
$Xi
<
19
);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
# rol
eval
(
shift
(
@insns
));
...
...
@@ -919,7 +942,6 @@ sub Xupdate_avx_32_79()
foreach
(
@insns
)
{
eval
;
}
# remaining instructions
$Xi
++
;
push
(
@X
,
shift
(
@X
));
# "rotate" X[]
push
(
@Tx
,
shift
(
@Tx
));
}
sub
Xuplast_avx_80
()
...
...
@@ -929,23 +951,21 @@ sub Xuplast_avx_80()
my
(
$a
,
$b
,
$c
,
$d
,
$e
);
eval
(
shift
(
@insns
));
&vpaddd
(
@Tx
[
1
],
@Tx
[
1
]
,
@X
[
-
1
&
7
]);
&vpaddd
(
@Tx
[
1
],
$Kx
,
@X
[
-
1
&
7
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&movdqa
(
eval
(
16
*
((
$Xi
-
1
)
&
3
))
.
"
(%rsp)
",
@Tx
[
1
]);
# X[]+K xfer IALU
&
v
movdqa
(
eval
(
16
*
((
$Xi
-
1
)
&
3
))
.
"
(%rsp)
",
@Tx
[
1
]);
# X[]+K xfer IALU
foreach
(
@insns
)
{
eval
;
}
# remaining instructions
&cmp
(
$inp
,
$num
);
&je
("
.Ldone_avx
");
unshift
(
@Tx
,
pop
(
@Tx
));
&je
(
$done_avx_label
);
&vmovdqa
(
@X
[
2
],"
64(
$K_XX_XX
)
");
# pbswap mask
&vmovdqa
(
@Tx
[
1
]
,"
0(
$K_XX_XX
)
");
# K_00_19
&vmovdqa
(
$Kx
,"
0(
$K_XX_XX
)
");
# K_00_19
&vmovdqu
(
@X
[
-
4
&
7
],"
0(
$inp
)
");
# load input
&vmovdqu
(
@X
[
-
3
&
7
],"
16(
$inp
)
");
&vmovdqu
(
@X
[
-
2
&
7
],"
32(
$inp
)
");
...
...
@@ -967,7 +987,7 @@ sub Xloop_avx()
&vpshufb
(
@X
[(
$Xi
-
3
)
&
7
],
@X
[(
$Xi
-
3
)
&
7
],
@X
[
2
]);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
&vpaddd
(
@X
[
$Xi
&
7
],
@X
[(
$Xi
-
4
)
&
7
],
@Tx
[
1
]
);
&vpaddd
(
@X
[
$Xi
&
7
],
@X
[(
$Xi
-
4
)
&
7
],
$Kx
);
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
eval
(
shift
(
@insns
));
...
...
@@ -1003,7 +1023,6 @@ ___
&Xupdate_avx_32_79
(
\
&body_20_39
);
&Xupdate_avx_32_79
(
\
&body_20_39
);
&Xupdate_avx_32_79
(
\
&body_20_39
);
&mov
(
@T
[
1
],
@V
[
2
]);
# copy of $c in next round
&Xupdate_avx_32_79
(
\
&body_40_59
);
&Xupdate_avx_32_79
(
\
&body_40_59
);
&Xupdate_avx_32_79
(
\
&body_40_59
);
...
...
@@ -1028,12 +1047,15 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $C,@T[1]
mov $D,12($ctx)
xor $D,@T[1]
mov $E,16($ctx)
and @T[1],@T[0]
jmp .Loop_avx
.align 16
.Ldone_avx
:
$done_avx_label
:
___
$j
=
$saved_j
;
@V
=
@saved_V
;
...
...
@@ -1061,9 +1083,10 @@ $code.=<<___ if ($win64);
movaps
64
+
32
(
%rsp
),
%xmm8
movaps
64
+
48
(
%rsp
),
%xmm9
movaps
64
+
64
(
%rsp
),
%xmm10
movaps
64
+
80
(
%rsp
),
%xmm11
___
$code
.=
<<___;
lea `64+($win64?
5
*16:0)`(%rsp),%rsi
lea `64+($win64?
6
*16:0)`(%rsp),%rsi
mov 0(%rsi),%r12
mov 8(%rsi),%rbp
mov 16(%rsi),%rbx
...
...
@@ -1174,9 +1197,9 @@ ssse3_handler:
lea 64(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$1
0
,%ecx
mov \$1
2
,%ecx
.long 0xa548f3fc # cld; rep movsq
lea `24+64+
5
*16`(%rax),%rax # adjust stack pointer
lea `24+64+
6
*16`(%rax),%rax # adjust stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录