Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
btwise
openssl
提交
3847d15d
O
openssl
项目概览
btwise
/
openssl
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
3847d15d
编写于
2月 05, 2014
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[aesni|sha*]-mb-x86_64.pl: add data prefetching.
上级
3ef477c6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
91 additions
and
20 deletions
+91
-20
crypto/aes/asm/aesni-mb-x86_64.pl
crypto/aes/asm/aesni-mb-x86_64.pl
+44
-8
crypto/sha/asm/sha1-mb-x86_64.pl
crypto/sha/asm/sha1-mb-x86_64.pl
+31
-9
crypto/sha/asm/sha256-mb-x86_64.pl
crypto/sha/asm/sha256-mb-x86_64.pl
+16
-3
未找到文件。
crypto/aes/asm/aesni-mb-x86_64.pl
浏览文件 @
3847d15d
...
...
@@ -15,8 +15,8 @@
# asymptotic measured
# ---------------------------
# Westmere 5.00/4=1.25 5.13/4=1.28
# Atom 15.0/4=3.75 15.7/4=3.93
# Sandy Bridge 5.06/4=1.27 5.15/4=1.29
# Atom 15.0/4=3.75 ?15.7/4=3.93
# Sandy Bridge 5.06/4=1.27 5.18/4=1.29
# Ivy Bridge 5.06/4=1.27 5.14/4=1.29
# Haswell 4.44/4=1.11 4.44/4=1.11
# Bulldozer 5.75/4=1.44 5.76/4=1.44
...
...
@@ -27,8 +27,8 @@
#
# asymptotic measured
# ---------------------------
# Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*)
# Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*)
# Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
# Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
# Haswell 5.00/8=0.63 5.00/8=0.63
# Bulldozer 5.75/8=0.72 5.77/8=0.72
#
...
...
@@ -188,7 +188,11 @@ $code.=<<___;
sub $offset,$sink
aesenc $rndkey1,@out[0]
prefetcht0 31(@inptr[0],$offset) # prefetch input
prefetcht0 31(@inptr[1],$offset)
aesenc $rndkey1,@out[1]
prefetcht0 31(@inptr[2],$offset)
prefetcht0 31(@inptr[2],$offset)
aesenc $rndkey1,@out[2]
aesenc $rndkey1,@out[3]
movups 0x30-0x78($key),$rndkey1
...
...
@@ -199,8 +203,8 @@ $code.=<<___;
cmp `32+4*$i`(%rsp),$one
aesenc $rndkey,@out[0]
aesenc $rndkey,@out[1]
cmovge $sink,@inptr[$i] # cancel input
aesenc $rndkey,@out[2]
cmovge $sink,@inptr[$i] # cancel input
cmovg $sink,@outptr[$i] # sink output
aesenc $rndkey,@out[3]
movups `0x40+16*$i-0x78`($key),$rndkey
...
...
@@ -209,7 +213,11 @@ ___
$code
.=
<<___;
movdqa $counters,$mask
aesenc $rndkey0,@out[0]
prefetcht0 15(@outptr[0],$offset) # prefetch output
prefetcht0 15(@outptr[1],$offset)
aesenc $rndkey0,@out[1]
prefetcht0 15(@outptr[2],$offset)
prefetcht0 15(@outptr[3],$offset)
aesenc $rndkey0,@out[2]
aesenc $rndkey0,@out[3]
movups 0x80-0x78($key),$rndkey0
...
...
@@ -260,13 +268,15 @@ $code.=<<___;
aesenc $rndkey0,@out[2]
aesenc $rndkey0,@out[3]
movups 0xe0-0x78($key),$rndkey0
jmp .Lenc4x_tail
.align 32
.Lenc4x_tail:
aesenc $rndkey1,@out[0]
aesenc $rndkey1,@out[1]
aesenc $rndkey1,@out[2]
movdqu (@inptr[0],$offset),@inp[0]
aesenc $rndkey1,@out[3]
movdqu (@inptr[0],$offset),@inp[0]
movdqu 0x10-0x78($key),$rndkey1
aesenclast $rndkey0,@out[0]
...
...
@@ -426,7 +436,11 @@ $code.=<<___;
sub $offset,$sink
aesdec $rndkey1,@out[0]
prefetcht0 31(@inptr[0],$offset) # prefetch input
prefetcht0 31(@inptr[1],$offset)
aesdec $rndkey1,@out[1]
prefetcht0 31(@inptr[2],$offset)
prefetcht0 31(@inptr[3],$offset)
aesdec $rndkey1,@out[2]
aesdec $rndkey1,@out[3]
movups 0x30-0x78($key),$rndkey1
...
...
@@ -447,7 +461,11 @@ ___
$code
.=
<<___;
movdqa $counters,$mask
aesdec $rndkey0,@out[0]
prefetcht0 15(@outptr[0],$offset) # prefetch output
prefetcht0 15(@outptr[1],$offset)
aesdec $rndkey0,@out[1]
prefetcht0 15(@outptr[2],$offset)
prefetcht0 15(@outptr[3],$offset)
aesdec $rndkey0,@out[2]
aesdec $rndkey0,@out[3]
movups 0x80-0x78($key),$rndkey0
...
...
@@ -498,7 +516,9 @@ $code.=<<___;
aesdec $rndkey0,@out[2]
aesdec $rndkey0,@out[3]
movups 0xe0-0x78($key),$rndkey0
jmp .Ldec4x_tail
.align 32
.Ldec4x_tail:
aesdec $rndkey1,@out[0]
aesdec $rndkey1,@out[1]
...
...
@@ -512,12 +532,12 @@ $code.=<<___;
movdqu 0x20-0x78($key),$rndkey0
aesdeclast @inp[0],@out[0]
movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
aesdeclast @inp[1],@out[1]
movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
movdqu -16(@inptr[1],$offset),@inp[1]
aesdeclast @inp[2],@out[2]
movdqu -16(@inptr[2],$offset),@inp[2]
aesdeclast @inp[3],@out[3]
movdqu -16(@inptr[2],$offset),@inp[2]
movdqu -16(@inptr[3],$offset),@inp[3]
movups @out[0],-16(@outptr[0],$offset)
...
...
@@ -682,7 +702,13 @@ $code.=<<___ if ($i);
___
$code
.=
<<___;
vaesenc $rndkey,@out[1],@out[1]
prefetcht0 31(@ptr[$i]) # prefetch input
vaesenc $rndkey,@out[2],@out[2]
___
$code
.=<<
___
if
(
$i
>
1
);
prefetcht0
15
(
@ptr
[
$i
-
2
])
# prefetch output
___
$code
.=
<<___;
vaesenc $rndkey,@out[3],@out[3]
lea (@ptr[$i],$offset),$offset
cmovge %rsp,@ptr[$i] # cancel input
...
...
@@ -703,6 +729,8 @@ ___
}
$code
.=
<<___;
vmovdqu 32(%rsp),$counters
prefetcht0 15(@ptr[$i-2]) # prefetch output
prefetcht0 15(@ptr[$i-1])
cmp \$11,$rounds
jb .Lenc8x_tail
...
...
@@ -958,7 +986,13 @@ $code.=<<___ if ($i);
___
$code
.=
<<___;
vaesdec $rndkey,@out[1],@out[1]
prefetcht0 31(@ptr[$i]) # prefetch input
vaesdec $rndkey,@out[2],@out[2]
___
$code
.=<<
___
if
(
$i
>
1
);
prefetcht0
15
(
@ptr
[
$i
-
2
])
# prefetch output
___
$code
.=
<<___;
vaesdec $rndkey,@out[3],@out[3]
lea (@ptr[$i],$offset),$offset
cmovge %rsp,@ptr[$i] # cancel input
...
...
@@ -979,6 +1013,8 @@ ___
}
$code
.=
<<___;
vmovdqu 32(%rsp),$counters
prefetcht0 15(@ptr[$i-2]) # prefetch output
prefetcht0 15(@ptr[$i-1])
cmp \$11,$rounds
jb .Ldec8x_tail
...
...
crypto/sha/asm/sha1-mb-x86_64.pl
浏览文件 @
3847d15d
...
...
@@ -14,20 +14,21 @@
#
# this +aesni(i) sha1 aesni-sha1 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70%
# Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62%
# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
# Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62%
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
# Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64%
# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
#
# (i) multi-block CBC encrypt with 128-bit key;
# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
# because of lower AES-NI instruction throughput;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 7.98+4.44=12.4;
# (iv) improvement coefficients in real-life application are somewhat
# lower and range from 30% to 100% (on Haswell);
# for n=4 is 8.00+4.44=12.4;
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 30% to 100% (on Haswell);
$flavour
=
shift
;
$output
=
shift
;
...
...
@@ -80,6 +81,14 @@ $Tbl="%rbp";
@Xi
=
map
("
%xmm
$_
",(
10
..
14
));
$K
=
"
%xmm15
";
if
(
1
)
{
# Atom-specific optimization aiming to eliminate pshufb with high
# registers [and thus get rid of 48 cycles accumulated penalty]
@Xi
=
map
("
%xmm
$_
",(
0
..
4
));
(
$tx
,
$t0
,
$t1
,
$t2
,
$t3
)
=
map
("
%xmm
$_
",(
5
..
9
));
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
)
=
map
("
%xmm
$_
",(
10
..
14
));
}
$REG_SZ
=
16
;
sub
Xi_off
{
...
...
@@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input
psrld
\
$
2
,
$b
paddd
$t2
,
$e
# e+=rol(a,5)
movd
`
4*
$j
-16*4
`(
@ptr
[
2
]),
$t2
pshufb
$tx
,
@Xi
[
1
]
movd
`
4*
$j
-16*4
`(
@ptr
[
2
]),
$t2
por
$t1
,
$b
# b=rol(b,30)
___
$code
.=<<
___
if
(
$i
==
14
);
# just load input
...
...
@@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input
movdqa
$b
,
$t1
movdqa
$b
,
$t0
pslld
\
$
5
,
$t2
prefetcht0
63
(
@ptr
[
0
])
pandn
$d
,
$t1
pand
$c
,
$t0
punpckldq
$t3
,
@Xi
[
1
]
...
...
@@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input
psrld
\
$
27
,
$t3
pxor
$t1
,
$t0
# Ch(b,c,d)
movdqa
$b
,
$t1
prefetcht0
63
(
@ptr
[
1
])
por
$t3
,
$t2
# rol(a,5)
pslld
\
$
30
,
$t1
paddd
$t0
,
$e
# e+=Ch(b,c,d)
prefetcht0
63
(
@ptr
[
2
])
psrld
\
$
2
,
$b
paddd
$t2
,
$e
# e+=rol(a,5)
pshufb
$tx
,
@Xi
[
1
]
prefetcht0
63
(
@ptr
[
3
])
por
$t1
,
$b
# b=rol(b,30)
___
$code
.=<<
___
if
(
$i
>=
13
&&
$i
<
15
);
...
...
@@ -382,12 +395,12 @@ $code.=<<___;
movdqu 0x60($ctx),$D
movdqu 0x80($ctx),$E
movdqa 0x60($Tbl),$tx # pbswap_mask
movdqa -0x20($Tbl),$K # K_00_19
jmp .Loop
.align 32
.Loop:
___
$code
.=
"
movdqa -0x20(
$Tbl
),
$K
\n
";
# K_00_19
for
(
$i
=
0
;
$i
<
20
;
$i
++
)
{
&BODY_00_19
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
$code
.=
"
movdqa 0x00(
$Tbl
),
$K
\n
";
# K_20_39
for
(;
$i
<
40
;
$i
++
)
{
&BODY_20_39
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
...
...
@@ -434,6 +447,7 @@ $code.=<<___;
movdqa @Xi[0],(%rbx) # save counters
movdqa 0x60($Tbl),$tx # pbswap_mask
movdqa -0x20($Tbl),$K # K_00_19
dec $num
jnz .Loop
...
...
@@ -551,6 +565,7 @@ $code.=<<___ if ($i<14);
___
$code
.=<<
___
if
(
$i
==
14
);
vpaddd
$K
,
$e
,
$e
# e+=K_00_19
prefetcht0
63
(
@ptr
[
0
])
vpslld
\
$
5
,
$a
,
$t2
vpandn
$d
,
$b
,
$t1
vpand
$c
,
$b
,
$t0
...
...
@@ -559,14 +574,17 @@ $code.=<<___ if ($i==14);
vpaddd
@Xi
[
0
],
$e
,
$e
# e+=X[i]
$vpack
$t3
,
@Xi
[
1
],
@Xi
[
1
]
vpsrld
\
$
27
,
$a
,
$t3
prefetcht0
63
(
@ptr
[
1
])
vpxor
$t1
,
$t0
,
$t0
# Ch(b,c,d)
vpslld
\
$
30
,
$b
,
$t1
vpor
$t3
,
$t2
,
$t2
# rol(a,5)
prefetcht0
63
(
@ptr
[
2
])
vpaddd
$t0
,
$e
,
$e
# e+=Ch(b,c,d)
vpsrld
\
$
2
,
$b
,
$b
vpaddd
$t2
,
$e
,
$e
# e+=rol(a,5)
prefetcht0
63
(
@ptr
[
3
])
vpshufb
$tx
,
@Xi
[
1
],
@Xi
[
1
]
vpor
$t1
,
$b
,
$b
# b=rol(b,30)
___
...
...
@@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate
vpaddd
$K
,
$e
,
$e
# e+=K_00_19
vpslld
\
$
5
,
$a
,
$t2
vpandn
$d
,
$b
,
$t1
`
"prefetcht0 63(
@ptr
[4])" if (
$i
==15 &&
$REG_SZ
==32)
`
vpand
$c
,
$b
,
$t0
vmovdqa
@Xi
[
0
],`
&Xi_off(
$i
)
`
...
...
@@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15); # apply Xupdate
vpsrld
\
$
27
,
$a
,
$t3
vpxor
$t1
,
$t0
,
$t0
# Ch(b,c,d)
vpxor
@Xi
[
3
],
@Xi
[
1
],
@Xi
[
1
]
`
"prefetcht0 63(
@ptr
[5])" if (
$i
==15 &&
$REG_SZ
==32)
`
vpslld
\
$
30
,
$b
,
$t1
vpor
$t3
,
$t2
,
$t2
# rol(a,5)
vpaddd
$t0
,
$e
,
$e
# e+=Ch(b,c,d)
`
"prefetcht0 63(
@ptr
[6])" if (
$i
==15 &&
$REG_SZ
==32)
`
vpsrld
\
$
31
,
@Xi
[
1
],
$tx
vpaddd
@Xi
[
1
],
@Xi
[
1
],
@Xi
[
1
]
vpsrld
\
$
2
,
$b
,
$b
`
"prefetcht0 63(
@ptr
[7])" if (
$i
==15 &&
$REG_SZ
==32)
`
vpaddd
$t2
,
$e
,
$e
# e+=rol(a,5)
vpor
$tx
,
@Xi
[
1
],
@Xi
[
1
]
# rol \$1,@Xi[1]
vpor
$t1
,
$b
,
$b
# b=rol(b,30)
...
...
crypto/sha/asm/sha256-mb-x86_64.pl
浏览文件 @
3847d15d
...
...
@@ -15,7 +15,7 @@
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
# Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
...
...
@@ -27,8 +27,9 @@
# AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# (iv) improvement coefficients in real-life application are somewhat
# lower and range from 75% to 130% (on Haswell);
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 130% (on Haswell);
$flavour
=
shift
;
$output
=
shift
;
...
...
@@ -135,6 +136,7 @@ $code.=<<___;
psrld \$25-11,$t2
movdqa $e,$t1
`"prefetch 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
...
...
@@ -142,6 +144,7 @@ $code.=<<___;
pand $f,$axb
pxor $t2,$sigma
`"prefetch 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
...
...
@@ -153,6 +156,7 @@ $code.=<<___;
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
...
...
@@ -160,6 +164,7 @@ $code.=<<___;
pand $axb,$bxc
pxor $sigma,$t2
`"prefetch 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
...
...
@@ -465,30 +470,38 @@ $code.=<<___;
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
`"prefetch 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
`"prefetch 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
`"prefetch 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
`"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
`"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
`"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
`"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录