Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
36df342f
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
36df342f
编写于
5月 25, 2013
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
aesni-x86_64.pl: optimize XTS.
PR: 3042
上级
4df2280b
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
338 addition
and
258 deletion
+338
-258
crypto/aes/asm/aesni-x86_64.pl
crypto/aes/asm/aesni-x86_64.pl
+338
-258
未找到文件。
crypto/aes/asm/aesni-x86_64.pl
浏览文件 @
36df342f
...
...
@@ -153,14 +153,14 @@
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.
33
cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.9
7
. Just like
# Add aesni_xts_[en|de]crypt. Westmere spends 1.
25
cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.9
0
. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
# in ECB, 0.71 in CTR, 0.9
5
in XTS... This means that aes[enc|dec]
# in ECB, 0.71 in CTR, 0.9
0
in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.
...
...
@@ -1430,7 +1430,7 @@ ___
my
@tweak
=
map
("
%xmm
$_
",(
10
..
15
));
my
(
$twmask
,
$twres
,
$twtmp
)
=
("
%xmm8
","
%xmm9
",
@tweak
[
4
]);
my
(
$key2
,
$ivp
,
$len_
)
=
("
%r8
","
%r9
","
%r9
");
my
$frame_size
=
0x
6
0
+
(
$win64
?
160
:
0
);
my
$frame_size
=
0x
7
0
+
(
$win64
?
160
:
0
);
$code
.=
<<___;
.globl aesni_xts_encrypt
...
...
@@ -1464,213 +1464,251 @@ ___
# generate the tweak
&aesni_generate1
("
enc
",
$key2
,
$rounds
,
@tweak
[
5
]);
$code
.=
<<___;
$movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
$movkey 16($key,$rnds_),$rndkey1 # last round key
mov $rounds,$rnds_
movdqa .Lxts_magic(%rip),$twmask
p
xor $twtmp,$twtmp
p
cmpgtd @tweak[5],$twtmp # broadcast upper bits
p
shufd \$0x5f,@tweak[5],$twres
p
xor $rndkey0,$rndkey1
___
# alternative tweak calculation algorithm is based on suggestions
# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
# and should help in the future...
for
(
$i
=
0
;
$i
<
4
;
$i
++
)
{
$code
.=
<<___;
pshufd \$0x13,$twtmp,$twres
p
xor $twtmp,$twtmp
movdqa $twres,$twtmp
p
addd $twres,$twres
movdqa @tweak[5],@tweak[$i]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,$twres # isolate carry and residue
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
psrad \$31,$twtmp # broadcast upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
pxor $rndkey0,@tweak[$i]
pxor $twtmp,@tweak[5]
___
}
$code
.=
<<___;
movdqa @tweak[5],@tweak[4]
psrad \$31,$twres
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
pxor $rndkey0,@tweak[4]
pxor $twres,@tweak[5]
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
sub \$16*6,$len
jc .Lxts_enc_short
shr \$1,$rounds
sub \$1,$rounds
sub \$3,$rounds
$movkey 16($key_),$rndkey1
mov $rounds,$rnds_
lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
.align
16
.align
32
.Lxts_enc_grandloop:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
pand $twmask,$twres # isolate carry and residue
movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
pxor $twres,@tweak[5]
pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
pxor @tweak[0],$inout0 # input^=tweak
movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
movdqu `16*4`($inp),$inout4
aesenc $rndkey1,$inout0
movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
movdqu `16*5`($inp),$inout5
lea `16*6`($inp),$inp
aesenc $rndkey1,$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
$movkey ($key_),$rndkey0
aesenc $rndkey1,$inout2
movdqu `16*5`($inp),$inout5
pxor @tweak[5],$twmask # round[0]^=tweak[5]
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
pxor @tweak[5],$inout5
aesenc $rndkey1,$inout3
$movkey 32($key_),$rndkey0
lea `16*6`($inp),$inp
pxor $twmask,$inout5
# inline _aesni_encrypt6 and interleave first and last rounds
# with own code...
$movkey 16($key_),$rndkey1
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
aesenc $rndkey1,$inout0
lea 32($key_),$key
pxor $rndkey0,$inout2
movdqa @tweak[1],`16*1`(%rsp)
aesenc $rndkey1,$inout1
pxor $rndkey0,$inout3
movdqa @tweak[2],`16*2`(%rsp)
aesenc $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
pxor $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
movdqa @tweak[4],`16*4`(%rsp)
pxor $twres,@tweak[0]
aesenc $rndkey1,$inout4
movdqa @tweak[5],`16*5`(%rsp)
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesenc $rndkey1,$inout5
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp
jmp .Lxts_enc_loop6_enter
$movkey 48($key_),$rndkey1
.align 16
aesenc $rndkey0,$inout0
pxor $twres,@tweak[2]
movdqa @tweak[1],`16*1`(%rsp)
aesenc $rndkey0,$inout1
pxor $twres,@tweak[3]
movdqa @tweak[2],`16*2`(%rsp)
aesenc $rndkey0,$inout2
pxor $twres,@tweak[4]
aesenc $rndkey0,$inout3
pxor $twres,$twmask
movdqa @tweak[4],`16*4`(%rsp)
aesenc $rndkey0,$inout4
movdqa $twmask,`16*5`(%rsp)
aesenc $rndkey0,$inout5
$movkey 64($key_),$rndkey0
lea 64($key_),$key
pshufd \$0x5f,@tweak[5],$twres
jmp .Lxts_enc_loop6
.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
.Lxts_enc_loop6_enter:
$movkey 16($key),$rndkey1
lea 32($key),$key
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
jnz .Lxts_enc_loop6
pshufd \$0x13,$twtmp,$twres
pxor $twtmp
,$twtmp
padd
q @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa (%r8),$twmask
movdqa $twres
,$twtmp
padd
d $twres,$twres
aesenc $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
paddq @tweak[5],@tweak[5]
psrad \$31,$twtmp
aesenc $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
pand $twmask,$twtmp
$movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
pxor @tweak[5],@tweak[0]
psrad \$31,$twtmp
aesenc $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey0,$inout2
pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout5
$movkey 32($key),$rndkey0
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[1]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
pxor @tweak[5],@tweak[1]
psrad \$31,$twtmp
aesenc $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey1,$inout2
pxor $twres,@tweak[5]
movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
movaps @tweak[2],@tweak[3]
aesenc $rndkey1,$inout5
$movkey 48($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[2]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesenclast $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesenclast $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesenclast $rndkey0,$inout2
pxor $twres,@tweak[5]
aesenclast $rndkey0,$inout3
aesenclast $rndkey0,$inout4
aesenclast $rndkey0,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[3]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
xorps `16*0`(%rsp),$inout0 # output^=tweak
pand $twmask,$twres # isolate carry and residue
xorps `16*1`(%rsp),$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey0,$inout0
pxor @tweak[5],@tweak[2]
psrad \$31,$twtmp
aesenc $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
movaps @tweak[3],@tweak[4]
aesenc $rndkey0,$inout5
movdqa $twres,$rndkey0
paddd $twres,$twres
aesenc $rndkey1,$inout0
pxor @tweak[5],@tweak[3]
psrad \$31,$rndkey0
aesenc $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$rndkey0
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
pxor $rndkey0,@tweak[5]
$movkey ($key_),$rndkey0
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
$movkey 16($key_),$rndkey1
pxor @tweak[5],@tweak[4]
psrad \$31,$twres
aesenclast `16*0`(%rsp),$inout0
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
aesenclast `16*1`(%rsp),$inout1
aesenclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
aesenclast `16*3`(%rsp),$inout3
aesenclast `16*4`(%rsp),$inout4
aesenclast `16*5`(%rsp),$inout5
mov $rnds_,$rounds # restore $rounds
xorps `16*2`(%rsp),$inout2
movups $inout0,`16*0`($out) # write output
xorps `16*3`(%rsp),$inout3
movups $inout1,`16*1`($out)
xorps `16*4`(%rsp),$inout4
movups $inout2,`16*2`($out)
xorps `16*5`(%rsp),$inout5
movups $inout3,`16*3`($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,`16*4`($out)
movups $inout5,`16*5`($out)
lea `16*6`($out),$out
movups $inout0,`-16*6`($out) # write output
movups $inout1,`-16*5`($out)
movups $inout2,`-16*4`($out)
movups $inout3,`-16*3`($out)
movups $inout4,`-16*2`($out)
movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_enc_grandloop
lea
3
($rounds,$rounds),$rounds # restore original value
lea
7
($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_enc_short:
pxor $rndkey0,@tweak[0]
add \$16*6,$len
jz .Lxts_enc_done
pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
jb .Lxts_enc_one
pxor $rndkey0,@tweak[2]
je .Lxts_enc_two
pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
jb .Lxts_enc_three
pxor $rndkey0,@tweak[4]
je .Lxts_enc_four
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movdqu 16*1($inp),$inout1
pxor $twres,@tweak[5]
movdqu ($inp),$inout0
movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
...
...
@@ -1765,15 +1803,15 @@ $code.=<<___;
call _aesni_encrypt4
xorps
@tweak[0],$inout0
movdqa @tweak[
5
],@tweak[0]
xorps
@tweak[1],$inout1
xorps
@tweak[2],$inout2
mov
ups
$inout0,($out)
xorps
@tweak[3],$inout3
mov
ups
$inout1,16*1($out)
mov
ups
$inout2,16*2($out)
mov
ups
$inout3,16*3($out)
pxor
@tweak[0],$inout0
movdqa @tweak[
4
],@tweak[0]
pxor
@tweak[1],$inout1
pxor
@tweak[2],$inout2
mov
dqu
$inout0,($out)
pxor
@tweak[3],$inout3
mov
dqu
$inout1,16*1($out)
mov
dqu
$inout2,16*2($out)
mov
dqu
$inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_enc_done
...
...
@@ -1865,213 +1903,248 @@ $code.=<<___;
shl \$4,%rax
sub %rax,$len
$movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
$movkey 16($key,$rnds_),$rndkey1 # last round key
mov $rounds,$rnds_
movdqa .Lxts_magic(%rip),$twmask
p
xor $twtmp,$twtmp
p
cmpgtd @tweak[5],$twtmp # broadcast upper bits
p
shufd \$0x5f,@tweak[5],$twres
p
xor $rndkey0,$rndkey1
___
for
(
$i
=
0
;
$i
<
4
;
$i
++
)
{
$code
.=
<<___;
pshufd \$0x13,$twtmp,$twres
p
xor $twtmp,$twtmp
movdqa $twres,$twtmp
p
addd $twres,$twres
movdqa @tweak[5],@tweak[$i]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,$twres # isolate carry and residue
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
psrad \$31,$twtmp # broadcast upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
pxor $rndkey0,@tweak[$i]
pxor $twtmp,@tweak[5]
___
}
$code
.=
<<___;
movdqa @tweak[5],@tweak[4]
psrad \$31,$twres
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
pxor $rndkey0,@tweak[4]
pxor $twres,@tweak[5]
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
sub \$16*6,$len
jc .Lxts_dec_short
shr \$1,$rounds
sub \$1,$rounds
sub \$3,$rounds
$movkey 16($key_),$rndkey1
mov $rounds,$rnds_
lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
.align
16
.align
32
.Lxts_dec_grandloop:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
pand $twmask,$twres # isolate carry and residue
movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
pxor $twres,@tweak[5]
pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
pxor @tweak[0],$inout0 # input^=tweak
movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
movdqu `16*4`($inp),$inout4
aesdec $rndkey1,$inout0
movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
movdqu `16*5`($inp),$inout5
lea `16*6`($inp),$inp
aesdec $rndkey1,$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
$movkey ($key_),$rndkey0
aesdec $rndkey1,$inout2
movdqu `16*5`($inp),$inout5
pxor @tweak[5],$twmask # round[0]^=tweak[5]
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
pxor @tweak[5],$inout5
aesdec $rndkey1,$inout3
$movkey 32($key_),$rndkey0
lea `16*6`($inp),$inp
pxor $twmask,$inout5
# inline _aesni_decrypt6 and interleave first and last rounds
# with own code...
$movkey 16($key_),$rndkey1
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
aesdec $rndkey1,$inout0
lea 32($key_),$key
pxor $rndkey0,$inout2
movdqa @tweak[1],`16*1`(%rsp)
aesdec $rndkey1,$inout1
pxor $rndkey0,$inout3
movdqa @tweak[2],`16*2`(%rsp)
aesdec $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
pxor $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
movdqa @tweak[4],`16*4`(%rsp)
pxor $twres,@tweak[0]
aesdec $rndkey1,$inout4
movdqa @tweak[5],`16*5`(%rsp)
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesdec $rndkey1,$inout5
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp
jmp .Lxts_dec_loop6_enter
$movkey 48($key_),$rndkey1
.align 16
aesdec $rndkey0,$inout0
pxor $twres,@tweak[2]
movdqa @tweak[1],`16*1`(%rsp)
aesdec $rndkey0,$inout1
pxor $twres,@tweak[3]
movdqa @tweak[2],`16*2`(%rsp)
aesdec $rndkey0,$inout2
pxor $twres,@tweak[4]
aesdec $rndkey0,$inout3
pxor $twres,$twmask
movdqa @tweak[4],`16*4`(%rsp)
aesdec $rndkey0,$inout4
movdqa $twmask,`16*5`(%rsp)
aesdec $rndkey0,$inout5
$movkey 64($key_),$rndkey0
lea 64($key_),$key
pshufd \$0x5f,@tweak[5],$twres
jmp .Lxts_dec_loop6
.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
dec $rounds
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
.Lxts_dec_loop6_enter:
$movkey 16($key),$rndkey1
lea 32($key),$key
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
lea 32($key),$key
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
jnz .Lxts_dec_loop6
pshufd \$0x13,$twtmp,$twres
pxor $twtmp
,$twtmp
padd
q @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa (%r8),$twmask
movdqa $twres
,$twtmp
padd
d $twres,$twres
aesdec $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
paddq @tweak[5],@tweak[5]
psrad \$31,$twtmp
aesdec $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
pand $twmask,$twtmp
$movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
pxor @tweak[5],@tweak[0]
psrad \$31,$twtmp
aesdec $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey0,$inout2
pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout5
$movkey 32($key),$rndkey0
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[1]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
pxor @tweak[5],@tweak[1]
psrad \$31,$twtmp
aesdec $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey1,$inout2
pxor $twres,@tweak[5]
movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
movaps @tweak[2],@tweak[3]
aesdec $rndkey1,$inout5
$movkey 48($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[2]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesdeclast $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesdeclast $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesdeclast $rndkey0,$inout2
pxor $twres,@tweak[5]
aesdeclast $rndkey0,$inout3
aesdeclast $rndkey0,$inout4
aesdeclast $rndkey0,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[3]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
xorps `16*0`(%rsp),$inout0 # output^=tweak
pand $twmask,$twres # isolate carry and residue
xorps `16*1`(%rsp),$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey0,$inout0
pxor @tweak[5],@tweak[2]
psrad \$31,$twtmp
aesdec $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
movaps @tweak[3],@tweak[4]
aesdec $rndkey0,$inout5
movdqa $twres,$rndkey0
paddd $twres,$twres
aesdec $rndkey1,$inout0
pxor @tweak[5],@tweak[3]
psrad \$31,$rndkey0
aesdec $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$rndkey0
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
pxor $rndkey0,@tweak[5]
$movkey ($key_),$rndkey0
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
$movkey 16($key_),$rndkey1
pxor @tweak[5],@tweak[4]
psrad \$31,$twres
aesdeclast `16*0`(%rsp),$inout0
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
aesdeclast `16*1`(%rsp),$inout1
aesdeclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
aesdeclast `16*3`(%rsp),$inout3
aesdeclast `16*4`(%rsp),$inout4
aesdeclast `16*5`(%rsp),$inout5
mov $rnds_,$rounds # restore $rounds
xorps `16*2`(%rsp),$inout2
movups $inout0,`16*0`($out) # write output
xorps `16*3`(%rsp),$inout3
movups $inout1,`16*1`($out)
xorps `16*4`(%rsp),$inout4
movups $inout2,`16*2`($out)
xorps `16*5`(%rsp),$inout5
movups $inout3,`16*3`($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,`16*4`($out)
movups $inout5,`16*5`($out)
lea `16*6`($out),$out
movups $inout0,`-16*6`($out) # write output
movups $inout1,`-16*5`($out)
movups $inout2,`-16*4`($out)
movups $inout3,`-16*3`($out)
movups $inout4,`-16*2`($out)
movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_dec_grandloop
lea
3
($rounds,$rounds),$rounds # restore original value
lea
7
($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_dec_short:
pxor $rndkey0,@tweak[0]
pxor $rndkey0,@tweak[1]
add \$16*6,$len
jz .Lxts_dec_done
pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
jb .Lxts_dec_one
pxor $rndkey0,@tweak[3]
je .Lxts_dec_two
pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
jb .Lxts_dec_three
je .Lxts_dec_four
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movdqu 16*1($inp),$inout1
pxor $twres,@tweak[5]
movdqu ($inp),$inout0
movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
...
...
@@ -2156,7 +2229,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[
5
],@tweak[1]
movdqa @tweak[
4
],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
...
...
@@ -2166,14 +2239,8 @@ $code.=<<___;
.align 16
.Lxts_dec_four:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movups ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movups 16*1($inp),$inout1
pxor $twres,@tweak[5]
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
...
...
@@ -2184,16 +2251,16 @@ $code.=<<___;
call _aesni_decrypt4
xorps
@tweak[0],$inout0
pxor
@tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
xorps
@tweak[1],$inout1
pxor
@tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
xorps
@tweak[2],$inout2
mov
ups
$inout0,($out)
xorps
@tweak[3],$inout3
mov
ups
$inout1,16*1($out)
mov
ups
$inout2,16*2($out)
mov
ups
$inout3,16*3($out)
pxor
@tweak[2],$inout2
mov
dqu
$inout0,($out)
pxor
@tweak[3],$inout3
mov
dqu
$inout1,16*1($out)
mov
dqu
$inout2,16*2($out)
mov
dqu
$inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_dec_done
...
...
@@ -3240,6 +3307,19 @@ sub aesni {
push
@opcode
,
0xc0
|
(
$
2
&
7
)
|
((
$
3
&
7
)
<<
3
);
# ModR/M
return
"
.byte
\t
"
.
join
('
,
',
@opcode
);
}
elsif
(
$line
=~
/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/
)
{
my
%opcodelet
=
(
"
aesenc
"
=>
0xdc
,
"
aesenclast
"
=>
0xdd
,
"
aesdec
"
=>
0xde
,
"
aesdeclast
"
=>
0xdf
);
return
undef
if
(
!
defined
(
$opcodelet
{
$
1
}));
my
$off
=
$
2
;
push
@opcode
,
0x44
if
(
$
3
>=
8
);
push
@opcode
,
0x0f
,
0x38
,
$opcodelet
{
$
1
};
push
@opcode
,
0x44
|
((
$
3
&
7
)
<<
3
),
0x24
;
# ModR/M
push
@opcode
,(
$off
=~
/^0/
?
oct
(
$off
):
$off
)
&
0xff
;
return
"
.byte
\t
"
.
join
('
,
',
@opcode
);
}
return
$line
;
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录