Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
btwise
openssl
提交
98dc1784
O
openssl
项目概览
btwise
/
openssl
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
98dc1784
编写于
11月 12, 2012
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
aes-x86_64.pl: Atom-specific optimizations, +10%.
vpaes-x86_64.pl: minor performance squeeze.
上级
89f1eb82
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
164 addition
and
167 deletion
+164
-167
crypto/aes/asm/aes-x86_64.pl
crypto/aes/asm/aes-x86_64.pl
+122
-124
crypto/aes/asm/vpaes-x86_64.pl
crypto/aes/asm/vpaes-x86_64.pl
+42
-43
未找到文件。
crypto/aes/asm/aes-x86_64.pl
浏览文件 @
98dc1784
...
...
@@ -19,9 +19,10 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
# ECB encrypt ECB decrypt CBC large chunk
# AMD64 33 41 13.0
# EM64T 38 59 18.6(*)
# Core 2 30 43 14.5(*)
# AMD64 33 43 13.0
# EM64T 38 56 18.6(*)
# Core 2 30 42 14.5(*)
# Atom 65 86 32.1(*)
#
# (*) with hyper-threading off
...
...
@@ -365,68 +366,66 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
shr \$16,$s2
movzb `&hi("$s3")`,$acc2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
movzb ($sbox,$acc0,1),$t4 #$t0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb
`&hi("$s3")`,$acc2
movzb
($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s0")`,$acc0
shr \$16,$s2
movzb ($sbox,$acc1,1),$t5 #$t1
movzb `&lo("$s2")`,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shr \$16,$s3
movzb `&lo("$s2")`,$acc1
shl \$8,$t4
shr \$16,$s3
shl \$8,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
xor $t5,$t1
movzb `&lo("$s3")`,$t4
shr \$16,$s0
movzb `&lo("$s3")`,$t4
shr \$16,$s1
movzb `&lo("$s0")`,$t5
xor $t5,$t1
shl \$8,$acc2
shl \$8,$acc0
movzb ($sbox,$t4,1),$t4 #$t1
movzb ($sbox,$t5,1),$t5 #$t2
movzb `&lo("$s0")`,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
xor $acc0,$t3
shl \$8,$acc0
movzb `&lo("$s1")`,$acc2
movzb `&hi("$s3")`,$acc0
shl \$16,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc0,$t3
movzb ($sbox,$t4,1),$t4 #$t1
movzb `&hi("$s3")`,$acc0
movzb ($sbox,$t5,1),$t5 #$t2
xor $acc1,$t0
movzb `&hi("$s0")`,$acc1
shr \$8,$s2
movzb `&hi("$s0")`,$acc1
shl \$16,$t4
shr \$8,$s1
shl \$16,$t5
xor $t4,$t1
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$s2,1),$s3 #$t3
movzb ($sbox,$s1,1),$s2 #$t2
shl \$16,$t4
shl \$16,$t5
shl \$16,$acc2
xor $t4,$t1
xor $t5,$t2
xor $acc2,$t3
shl \$24,$acc0
xor $acc2,$t3
shl \$24,$acc1
shl \$24,$s3
xor $acc0,$t0
shl \$24,$s
2
shl \$24,$s
3
xor $acc1,$t1
shl \$24,$s2
mov $t0,$s0
mov $t1,$s1
xor $t2,$s2
...
...
@@ -465,12 +464,12 @@ sub enctransform()
{
my
(
$t3
,
$r20
,
$r21
)
=
(
$acc2
,"
%r8d
","
%r9d
");
$code
.=
<<___;
mov
$s0,$acc
0
mov
$s1,$acc
1
and
\$0x80808080,$acc
0
and
\$0x80808080,$acc
1
mov $
acc0,$t
0
mov $
acc1,$t
1
mov
\$0x80808080,$t
0
mov
\$0x80808080,$t
1
and
$s0,$t
0
and
$s1,$t
1
mov $
t0,$acc
0
mov $
t1,$acc
1
shr \$7,$t0
lea ($s0,$s0),$r20
shr \$7,$t1
...
...
@@ -488,25 +487,25 @@ $code.=<<___;
xor $r20,$s0
xor $r21,$s1
mov $s2,$acc0
mov $s3,$acc1
mov \$0x80808080,$t2
rol \$24,$s0
mov \$0x80808080,$t3
rol \$24,$s1
and
\$0x80808080,$acc0
and
\$0x80808080,$acc1
and
$s2,$t2
and
$s3,$t3
xor $r20,$s0
xor $r21,$s1
mov $acc0,$t2
mov $acc1,$t3
mov $t2,$acc0
ror \$16,$t0
mov $t3,$acc1
ror \$16,$t1
shr \$7,$t2
lea ($s2,$s2),$r20
shr \$7,$t2
xor $t0,$s0
xor $t1,$s1
shr \$7,$t3
lea ($s3,$s3),$r2
1
xor $t1,$s
1
ror \$8,$t0
lea ($s3,$s3),$r21
ror \$8,$t1
sub $t2,$acc0
sub $t3,$acc1
...
...
@@ -522,23 +521,23 @@ $code.=<<___;
xor $acc0,$r20
xor $acc1,$r21
ror \$16,$t2
xor $r20,$s2
ror \$16,$t3
xor $r21,$s3
rol \$24,$s2
mov 0($sbox),$acc0 # prefetch Te4
rol \$24,$s3
xor $r20,$s2
xor $r21,$s3
mov 0($sbox),$acc0 # prefetch Te4
ror \$16,$t2
ror \$16,$t3
mov 64($sbox),$acc1
xor $t2,$s2
xor $t3,$s3
xor $r21,$s3
mov 128($sbox),$r20
xor $t2,$s2
ror \$8,$t2
xor $t3,$s3
ror \$8,$t3
mov 192($sbox),$r21
xor $t2,$s2
mov 192($sbox),$r21
xor $t3,$s3
___
}
...
...
@@ -935,70 +934,69 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb `&lo("$s3")`,$t3
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
shr \$16,$s3
movzb `&hi("$s1")`,$acc2
movzb ($sbox,$t0,1),$t0
movzb ($sbox,$t1,1),$t1
movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
movzb ($sbox,$acc0,1),$t4 #$t0
movzb ($sbox,$acc1,1),$t5 #$t1
movzb
`&hi("$s1")`,$acc2
movzb
($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s2")`,$acc0
shr \$16,$s2
movzb ($sbox,$acc1,1),$t5 #$t1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
shr \$16,$s3
movzb `&lo("$s2")`,$acc1
shl \$8,$t4
shr \$16,$s2
shl \$8,$t5
movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
xor $t5,$t1
movzb `&lo("$s3")`,$t4
shl \$8,$t4
movzb `&lo("$s2")`,$acc1
shr \$16,$s0
xor $t4,$t0
shr \$16,$s1
movzb `&lo("$s0")`,$t5
movzb `&lo("$s3")`,$t4
shl \$8,$acc2
xor $t5,$t1
shl \$8,$acc0
movzb
($sbox,$t4,1),$t4 #$t1
movzb ($sbox,$
t5,1),$t5 #$t2
movzb
`&lo("$s0")`,$t5
movzb ($sbox,$
acc1,1),$acc1 #$t0
xor $acc2,$t2
xor $acc0,$t3
movzb `&lo("$s1")`,$acc2
movzb `&hi("$s1")`,$acc0
shl \$16,$acc1
xor $acc0,$t3
movzb ($sbox,$t4,1),$t4 #$t1
movzb `&hi("$s1")`,$acc0
movzb ($sbox,$acc2,1),$acc2 #$t3
movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc1,$t0
movzb ($sbox,$t5,1),$t5 #$t2
movzb `&hi("$s2")`,$acc1
shl \$16,$acc2
shl \$16,$t4
shl \$16,$t5
movzb ($sbox,$acc1,1),$s1 #$t1
xor $acc2,$t3
movzb `&hi("$s3")`,$acc2
xor $t4,$t1
shr \$8,$s0
xor $t5,$t2
movzb `&hi("$s3")`,$acc1
shr \$8,$s0
shl \$16,$acc2
movzb ($sbox,$acc1,1),$s2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$s1 #$t1
movzb ($sbox,$acc2,1),$s2 #$t2
movzb ($sbox,$s0,1),$s3 #$t3
xor $acc2,$t3
mov $t0,$s0
shl \$24,$acc0
shl \$24,$s1
shl \$24,$s2
xor $acc0,$
t
0
xor $acc0,$
s
0
shl \$24,$s3
xor $t1,$s1
mov $t0,$s0
xor $t2,$s2
xor $t3,$s3
___
...
...
@@ -1013,12 +1011,12 @@ sub dectransform()
my
$prefetch
=
shift
;
$code
.=
<<___;
mov $
tp10,$acc
0
mov $
tp18,$acc
8
and $
mask80,$acc
0
and $
mask80,$acc
8
mov $
acc0,$tp4
0
mov $
acc8,$tp4
8
mov $
mask80,$tp4
0
mov $
mask80,$tp4
8
and $
tp10,$tp4
0
and $
tp18,$tp4
8
mov $
tp40,$acc
0
mov $
tp48,$acc
8
shr \$7,$tp40
lea ($tp10,$tp10),$tp20
shr \$7,$tp48
...
...
@@ -1029,15 +1027,15 @@ $code.=<<___;
and $maskfe,$tp28
and $mask1b,$acc0
and $mask1b,$acc8
xor $
tp20,$acc
0
xor $
tp28,$acc
8
mov $
acc0,$tp2
0
mov $
acc8,$tp2
8
and $
mask80,$acc
0
and $
mask80,$acc
8
mov $
acc0,$tp8
0
mov $
acc8,$tp8
8
xor $
acc0,$tp2
0
xor $
acc8,$tp2
8
mov $
mask80,$tp8
0
mov $
mask80,$tp8
8
and $
tp20,$tp8
0
and $
tp28,$tp8
8
mov $
tp80,$acc
0
mov $
tp88,$acc
8
shr \$7,$tp80
lea ($tp20,$tp20),$tp40
shr \$7,$tp88
...
...
@@ -1048,15 +1046,15 @@ $code.=<<___;
and $maskfe,$tp48
and $mask1b,$acc0
and $mask1b,$acc8
xor $
tp40,$acc
0
xor $
tp48,$acc
8
mov $
acc0,$tp4
0
mov $
acc8,$tp4
8
and $
mask80,$acc
0
and $
mask80,$acc
8
mov $
acc0,$tp8
0
mov $
acc8,$tp8
8
xor $
acc0,$tp4
0
xor $
acc8,$tp4
8
mov $
mask80,$tp8
0
mov $
mask80,$tp8
8
and $
tp40,$tp8
0
and $
tp48,$tp8
8
mov $
tp80,$acc
0
mov $
tp88,$acc
8
shr \$7,$tp80
xor $tp10,$tp20 # tp2^=tp1
shr \$7,$tp88
...
...
@@ -1081,51 +1079,51 @@ $code.=<<___;
mov $tp10,$acc0
mov $tp18,$acc8
xor $tp80,$tp40 # tp4^tp1^=tp8
xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc0
xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc8
xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
xor `&LO("$tp80")`,`&LO("$tp10")`
xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp80
xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp88
xor `&LO("$tp80")`,`&LO("$acc0")`
xor `&LO("$tp88")`,`&LO("$acc8")`
mov $tp20,$tp80
mov $tp28,$tp88
shr \$32,$tp80
shr \$32,$tp88
rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
mov $tp28,$tp88
rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
shr \$32,$tp80
xor `&LO("$tp20")`,`&LO("$tp10")`
shr \$32,$tp88
xor `&LO("$tp28")`,`&LO("$tp18")`
rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
mov $tp40,$tp20
rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
mov $tp48,$tp28
shr \$32,$tp20
xor `&LO("$tp80")`,`&LO("$acc0")`
shr \$32,$tp28
xor `&LO("$tp88")`,`&LO("$acc8")`
`"mov 0($sbox),$mask80" if ($prefetch)`
shr \$32,$tp20
shr \$32,$tp28
`"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
`"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
`"mov 128($sbox),$mask1b" if ($prefetch)`
rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
`"mov 192($sbox),$tp80" if ($prefetch)`
xor `&LO("$tp40")`,`&LO("$tp10")`
rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
xor `&LO("$tp48")`,`&LO("$tp18")`
`"mov 256($sbox),$tp88" if ($prefetch)`
xor `&LO("$tp20")`,`&LO("$acc0")`
...
...
crypto/aes/asm/vpaes-x86_64.pl
浏览文件 @
98dc1784
...
...
@@ -27,9 +27,9 @@
#
# aes-x86_64.pl vpaes-x86_64.pl
#
# Core 2(**)
30.5/43.7/14.3 21.8/25.7
(***)
# Nehalem
30.5/42.2/14.6 9.8
/11.8
# Atom
63.9/79.0/32.1 64.0/84.8
(***)
# Core 2(**)
29.6/41.1/14.3 21.9/25.2
(***)
# Nehalem
29.6/40.3/14.6 10.0
/11.8
# Atom
57.3/74.2/32.1 60.9/82.3
(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
...
...
@@ -40,7 +40,7 @@
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +
40%/78
% improvement on Core 2
# pshufb, yet it's respectable +
36%/62
% improvement on Core 2
# (as implied, over "hyper-threading-safe" code path).
#
# <appro@openssl.org>
...
...
@@ -94,8 +94,8 @@ _vpaes_encrypt_core:
movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
pshufb %xmm1, %xmm0
pxor %xmm5, %xmm2
pxor %xmm2, %xmm0
add \$16, %r9
pxor %xmm2, %xmm0
lea .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
...
...
@@ -103,19 +103,19 @@ _vpaes_encrypt_core:
.Lenc_loop:
# middle of middle round
movdqa %xmm13, %xmm4 # 4 : sb1u
pshufb %xmm2, %xmm4 # 4 = sb1u
pxor %xmm5, %xmm4 # 4 = sb1u + k
movdqa %xmm12, %xmm0 # 0 : sb1t
pshufb %xmm2, %xmm4 # 4 = sb1u
pshufb %xmm3, %xmm0 # 0 = sb1t
pxor %xmm
4, %xmm0 # 0 = A
pxor %xmm
5, %xmm4 # 4 = sb1u + k
movdqa %xmm15, %xmm5 # 4 : sb2u
p
shufb %xmm2, %xmm5 # 4 = sb2u
p
xor %xmm4, %xmm0 # 0 = A
movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
pshufb %xmm2, %xmm5 # 4 = sb2u
movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
movdqa %xmm14, %xmm2 # 2 : sb2t
pshufb %xmm3, %xmm2 # 2 = sb2t
pxor %xmm5, %xmm2 # 2 = 2A
movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
movdqa %xmm0, %xmm3 # 3 = A
pxor %xmm5, %xmm2 # 2 = 2A
pshufb %xmm1, %xmm0 # 0 = B
add \$16, %r9 # next key
pxor %xmm2, %xmm0 # 0 = 2A+B
...
...
@@ -124,30 +124,30 @@ _vpaes_encrypt_core:
pxor %xmm0, %xmm3 # 3 = 2A+B+D
pshufb %xmm1, %xmm0 # 0 = 2B+C
and \$0x30, %r11 # ... mod 4
pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
sub \$1,%rax # nr--
pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
.Lenc_entry:
# top of round
movdqa %xmm9, %xmm1 # 1 : i
movdqa %xmm11, %xmm5 # 2 : a/k
pandn %xmm0, %xmm1 # 1 = i<<4
psrld \$4, %xmm1 # 1 = i
pand %xmm9, %xmm0 # 0 = k
movdqa %xmm11, %xmm5 # 2 : a/k
pshufb %xmm0, %xmm5 # 2 = a/k
pxor %xmm1, %xmm0 # 0 = j
movdqa %xmm10, %xmm3 # 3 : 1/i
pxor %xmm1, %xmm0 # 0 = j
pshufb %xmm1, %xmm3 # 3 = 1/i
pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
movdqa %xmm10, %xmm4 # 4 : 1/j
pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
pshufb %xmm0, %xmm4 # 4 = 1/j
pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
movdqa %xmm10, %xmm2 # 2 : 1/iak
pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
pshufb %xmm3, %xmm2 # 2 = 1/iak
pxor %xmm0, %xmm2 # 2 = io
movdqa %xmm10, %xmm3 # 3 : 1/jak
movdqu (%r9), %xmm5
pxor %xmm0, %xmm2 # 2 = io
pshufb %xmm4, %xmm3 # 3 = 1/jak
movdqu (%r9), %xmm5
pxor %xmm1, %xmm3 # 3 = jo
jnz .Lenc_loop
...
...
@@ -200,62 +200,61 @@ _vpaes_decrypt_core:
## Inverse mix columns
##
movdqa -0x20(%r10),%xmm4 # 4 : sb9u
movdqa -0x10(%r10),%xmm1 # 0 : sb9t
pshufb %xmm2, %xmm4 # 4 = sb9u
pshufb %xmm3, %xmm1 # 0 = sb9t
pxor %xmm0, %xmm4
movdqa -0x10(%r10),%xmm0 # 0 : sb9t
pshufb %xmm3, %xmm0 # 0 = sb9t
pxor %xmm4, %xmm0 # 0 = ch
add \$16, %r9 # next round key
pxor %xmm4, %xmm1 # 0 = ch
pshufb %xmm5, %xmm0 # MC ch
movdqa 0x00(%r10),%xmm4 # 4 : sbdu
pshufb %xmm5, %xmm1 # MC ch
pshufb %xmm2, %xmm4 # 4 = sbdu
pxor %xmm0, %xmm4 # 4 = ch
movdqa 0x10(%r10),%xmm0 # 0 : sbdt
pxor %xmm1, %xmm4 # 4 = ch
pshufb %xmm3, %xmm0 # 0 = sbdt
pxor %xmm4, %xmm0 # 0 = ch
sub \$1,%rax # nr--
pxor %xmm4, %xmm0 # 0 = ch
pshufb %xmm5, %xmm0 # MC ch
movdqa 0x20(%r10),%xmm4 # 4 : sbbu
pshufb %xmm5, %xmm0 # MC ch
movdqa 0x30(%r10),%xmm1 # 0 : sbbt
pshufb %xmm2, %xmm4 # 4 = sbbu
pshufb %xmm3, %xmm1 # 0 = sbbt
pxor %xmm0, %xmm4 # 4 = ch
movdqa 0x30(%r10),%xmm0 # 0 : sbbt
pshufb %xmm3, %xmm0 # 0 = sbbt
pxor %xmm4, %xmm0 # 0 = ch
pxor %xmm4, %xmm1 # 0 = ch
pshufb %xmm5, %xmm0 # MC ch
movdqa 0x40(%r10),%xmm4 # 4 : sbeu
pshufb %xmm2, %xmm4 # 4 = sbeu
pxor %xmm0, %xmm4 # 4 = ch
pshufb %xmm5, %xmm1 # MC ch
movdqa 0x50(%r10),%xmm0 # 0 : sbet
pshufb %xmm2, %xmm4 # 4 = sbeu
pshufb %xmm3, %xmm0 # 0 = sbet
pxor %xmm4, %xmm0 # 0 = ch
palignr \$12, %xmm5, %xmm5
pxor %xmm1, %xmm4 # 4 = ch
pxor %xmm4, %xmm0 # 0 = ch
.Ldec_entry:
# top of round
movdqa %xmm9, %xmm1 # 1 : i
pandn %xmm0, %xmm1 # 1 = i<<4
movdqa %xmm11, %xmm2 # 2 : a/k
psrld \$4, %xmm1 # 1 = i
pand %xmm9, %xmm0 # 0 = k
movdqa %xmm11, %xmm2 # 2 : a/k
pshufb %xmm0, %xmm2 # 2 = a/k
pxor %xmm1, %xmm0 # 0 = j
movdqa %xmm10, %xmm3 # 3 : 1/i
pxor %xmm1, %xmm0 # 0 = j
pshufb %xmm1, %xmm3 # 3 = 1/i
pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
movdqa %xmm10, %xmm4 # 4 : 1/j
pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
pshufb %xmm0, %xmm4 # 4 = 1/j
pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
movdqa %xmm10, %xmm2 # 2 : 1/iak
pshufb %xmm3, %xmm2 # 2 = 1/iak
pxor %xmm0, %xmm2 # 2 = io
movdqa %xmm10, %xmm3 # 3 : 1/jak
pxor %xmm0, %xmm2 # 2 = io
pshufb %xmm4, %xmm3 # 3 = 1/jak
pxor %xmm1, %xmm3 # 3 = jo
movdqu (%r9), %xmm0
pxor %xmm1, %xmm3 # 3 = jo
jnz .Ldec_loop
# middle of last round
...
...
@@ -463,12 +462,12 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,\@abi-omnipotent
.align 16
_vpaes_schedule_192_smear:
pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
pxor %xmm0, %xmm6 # -> c+d c 0 0
pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
pxor %xmm1, %xmm6 # -> c+d c 0 0
pxor %xmm1, %xmm1
pxor %xmm0, %xmm6 # -> b+c+d b+c b a
movdqa %xmm6, %xmm0
pxor %xmm1, %xmm1
movhlps %xmm1, %xmm6 # clobber low side with zeros
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录