提交 5599c733 编写于 作者: A Andy Polyakov

aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.

Improve CBC decrypt and CTR by ~13/16%, which adds up to ~25/33%
improvement over "pre-Silvermont" version. [Add performance table to
aesni-x86.pl].
上级 385b3486
......@@ -43,6 +43,17 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS ECB
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
# Atom 5.77/3.56 3.67 4.03 3.46
# Bulldozer 5.80/0.98 1.05 1.24 0.93
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
......
......@@ -158,25 +158,19 @@
# in CTR mode AES instruction interleave factor was chosen to be 6x.
######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.
######################################################################
# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
# decrypt, CTR and ECB, 0.73 in XTS.
######################################################################
# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizable
# modes [other than XTS] are actually suboptimal, because of penalties
# incurred by operations on %xmm8-15, which are inevitable with such
# high instruction interleave factors. This means that performance can
# be improved by decreasing the interleave factor, but then it would
# negatively affect other platforms in relatively larger degree.
# Run-time detection would solve the dilemma...
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS ECB
# Westmere 3.77/1.25 1.25 1.25 1.26
# * Bridge 5.07/0.74 0.75 0.90 0.85
# Haswell 4.44/0.63 0.63 0.73 0.63
# Atom 5.75/3.54 3.56 4.12 3.87(*)
# Bulldozer 5.77/0.70 0.72 0.90 0.70
#
# (*) Atom ECB result is suboptimal because of penalties incurred
# by operations on %xmm8-15. As ECB is not considered
# critical, nothing was done to mitigate the problem.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
......@@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
......@@ -1119,7 +1114,9 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
......@@ -1130,10 +1127,104 @@ $code.=<<___;
cmp \$8,$len
jb .Lctr32_tail
sub \$6,$len
cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
je .Lctr32_6x
lea 0x80($key),$key # size optimization
sub \$8,$len
sub \$2,$len
jmp .Lctr32_loop8
.align 16
.Lctr32_6x:
shl \$4,$rounds
mov \$48,$rnds_
bswap $key0
lea 32($key,$rounds),$key # end of key schedule
sub %rax,%r10 # twisted $rounds
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
add \$6,$ctr
$movkey -48($key,$rnds_),$rndkey0
aesenc $rndkey1,$inout0
mov $ctr,%eax
xor $key0,%eax
aesenc $rndkey1,$inout1
movbe %eax,`0x00+12`(%rsp)
lea 1($ctr),%eax
aesenc $rndkey1,$inout2
xor $key0,%eax
movbe %eax,`0x10+12`(%rsp)
aesenc $rndkey1,$inout3
lea 2($ctr),%eax
xor $key0,%eax
aesenc $rndkey1,$inout4
movbe %eax,`0x20+12`(%rsp)
lea 3($ctr),%eax
aesenc $rndkey1,$inout5
$movkey -32($key,$rnds_),$rndkey1
xor $key0,%eax
aesenc $rndkey0,$inout0
movbe %eax,`0x30+12`(%rsp)
lea 4($ctr),%eax
aesenc $rndkey0,$inout1
xor $key0,%eax
movbe %eax,`0x40+12`(%rsp)
aesenc $rndkey0,$inout2
lea 5($ctr),%eax
xor $key0,%eax
aesenc $rndkey0,$inout3
movbe %eax,`0x50+12`(%rsp)
mov %r10,%rax # mov $rnds_,$rounds
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey -16($key,$rnds_),$rndkey0
call .Lenc_loop6
movdqu ($inp),$inout6
movdqu 0x10($inp),$inout7
movdqu 0x20($inp),$in0
movdqu 0x30($inp),$in1
movdqu 0x40($inp),$in2
movdqu 0x50($inp),$in3
lea 0x60($inp),$inp
$movkey -64($key,$rnds_),$rndkey1
pxor $inout0,$inout6
movaps 0x00(%rsp),$inout0
pxor $inout1,$inout7
movaps 0x10(%rsp),$inout1
pxor $inout2,$in0
movaps 0x20(%rsp),$inout2
pxor $inout3,$in1
movaps 0x30(%rsp),$inout3
pxor $inout4,$in2
movaps 0x40(%rsp),$inout4
pxor $inout5,$in3
movaps 0x50(%rsp),$inout5
movdqu $inout6,($out)
movdqu $inout7,0x10($out)
movdqu $in0,0x20($out)
movdqu $in1,0x30($out)
movdqu $in2,0x40($out)
movdqu $in3,0x50($out)
lea 0x60($out),$out
sub \$6,$len
jnc .Lctr32_loop6
add \$6,$len
jz .Lctr32_done
lea -48($rnds_),$rounds
lea -80($key,$rnds_),$key # restore $key
neg $rounds
shr \$4,$rounds # restore $rounds
jmp .Lctr32_tail
.align 32
.Lctr32_loop8:
add \$8,$ctr
......@@ -2455,10 +2546,15 @@ $code.=<<___;
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
mov OPENSSL_ia32cap_P+4(%rip),%r9d
cmp \$0x70,$len
jbe .Lcbc_dec_six_or_seven
sub \$0x70,$len
and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
sub \$0x50,$len
cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
je .Lcbc_dec_loop6_enter
sub \$0x20,$len
lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
......@@ -2638,6 +2734,51 @@ $code.=<<___;
movdqa $inout6,$inout0
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_loop6:
movups $inout5,($out)
lea 0x10($out),$out
movdqu 0x00($inp),$inout0 # load input
movdqu 0x10($inp),$inout1
movdqa $inout0,$in0
movdqu 0x20($inp),$inout2
movdqa $inout1,$in1
movdqu 0x30($inp),$inout3
movdqa $inout2,$in2
movdqu 0x40($inp),$inout4
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
.Lcbc_dec_loop6_enter:
lea 0x60($inp),$inp
movdqa $inout5,$inout6
call _aesni_decrypt6
pxor $iv,$inout0 # ^= IV
movdqa $inout6,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
mov $key_,$key
movdqu $inout3,0x30($out)
pxor $in4,$inout5
mov $rnds_,$rounds
movdqu $inout4,0x40($out)
lea 0x50($out),$out
sub \$0x60,$len
ja .Lcbc_dec_loop6
movdqa $inout5,$inout0
add \$0x50,$len
jle .Lcbc_dec_tail_collected
movups $inout5,($out)
lea 0x10($out),$out
.Lcbc_dec_tail:
movups ($inp),$inout0
sub \$0x10,$len
......@@ -3360,8 +3501,14 @@ sub aesni {
return $line;
}
# Hand-encode "movbe %eax,disp8(%rsp)" as raw bytes so the generated
# assembly still assembles with tools that do not know the MOVBE
# mnemonic.  Encoding: 0f 38 f1 /r = MOVBE r/m32,r32; ModRM 0x44 +
# SIB 0x24 select disp8(%rsp) with %eax as source; the argument is the
# decimal 8-bit displacement captured from the matched instruction.
sub movbe {
	my $disp = shift;
	return sprintf ".byte 0x0f,0x38,0xf1,0x44,0x24,%s", $disp;
}
# Post-process the generated text before printing.
# 1) Evaluate every `...` span in Perl (the /e modifier runs eval($1)),
#    resolving computed displacements such as `0x00+12`.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# 2) Rewrite AES instructions that operate on %xmm registers through
#    aesni() (defined above), so assemblers without AES-NI support can
#    still process the output.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# 3) Likewise replace "movbe %eax,disp(%rsp)" with its raw byte encoding
#    via movbe(), passing only the captured decimal displacement
#    (assumed to fit in disp8 — all offsets emitted above are <= 0x5c).
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册