提交 f8501464 编写于 作者: A Andy Polyakov

aesni-x86[_64].pl: optimize for Sandy Bridge and add XTS mode.

上级 96abea33
......@@ -27,7 +27,21 @@
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
......@@ -51,14 +65,14 @@ $out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
$inout0="xmm0";
$inout1="xmm1";
$inout2="xmm2";
$rndkey0="xmm3";
$rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7"; $inout3="xmm7";
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
# AESNI extenstion
sub aeskeygenassist
......@@ -80,13 +94,15 @@ sub aesdeclast { aescommon(0xdf,@_); }
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
&movdqu ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
&pxor ($inout,$rndkey0);
&xorps ($inout,$ivec) if (defined($ivec));
&xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
......@@ -100,9 +116,9 @@ sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
&movdqu ($rndkey0,&QWP(0,$key));
&movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
&pxor ($inout,$rndkey0);
&xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
......@@ -147,7 +163,7 @@ sub aesni_generate1 # fully unrolled loop
&function_begin_B("${PREFIX}_encrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movdqu ($inout0,&QWP(0,"eax"));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
......@@ -163,7 +179,7 @@ sub aesni_generate1 # fully unrolled loop
&function_begin_B("${PREFIX}_decrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movdqu ($inout0,&QWP(0,"eax"));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
......@@ -174,16 +190,19 @@ sub aesni_generate1 # fully unrolled loop
&ret ();
&function_end_B("${PREFIX}_decrypt");
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine. As soon
# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycles I would have to
# implement 6x interleave and use it in loop...
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x, but it's unfeasible to accommodate it
# in XMM registers addreassable in 32-bit mode and therefore 6x is
# used instead...
sub aesni_generate3
{ my $p=shift;
......@@ -192,7 +211,7 @@ sub aesni_generate3
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&lea ($key,&DWP(32,$key));
&pxor ($inout0,$rndkey0);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
......@@ -231,13 +250,13 @@ sub aesni_generate4
&$movekey ($rndkey1,&QWP(16,$key));
&shr ($rounds,1);
&lea ($key,&DWP(32,$key));
&pxor ($inout0,$rndkey0);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
&set_label("${p}3_loop");
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&dec ($rounds);
......@@ -250,7 +269,7 @@ sub aesni_generate4
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("${p}3_loop"));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
......@@ -263,10 +282,73 @@ sub aesni_generate4
&ret();
&function_end_B("_aesni_${p}rypt4");
}
sub aesni_generate6
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout2,$rndkey0);
eval"&aes${p} ($inout1,$rndkey1)";
&pxor ($inout3,$rndkey0);
&dec ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout4,$rndkey0);
eval"&aes${p} ($inout3,$rndkey1)";
&pxor ($inout5,$rndkey0);
eval"&aes${p} ($inout4,$rndkey1)";
&$movekey ($rndkey0,&QWP(0,$key));
eval"&aes${p} ($inout5,$rndkey1)";
&jmp (&label("_aesni_${p}rypt6_enter"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&dec ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&set_label("_aesni_${p}rypt6_enter",16);
&$movekey ($rndkey1,&QWP(16,$key));
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
&lea ($key,&DWP(32,$key));
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
&$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
eval"&aes${p}last ($inout4,$rndkey0)";
eval"&aes${p}last ($inout5,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt6");
}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
if ($PREFIX eq "aesni") {
######################################################################
......@@ -278,37 +360,62 @@ if ($PREFIX eq "aesni") {
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds,&wparam(4));
&cmp ($len,16);
&jb (&label("ecb_ret"));
&mov ($rounds_,&wparam(4));
&and ($len,-16);
&test ($rounds,$rounds)
&jz (&label("ecb_ret"));
&mov ($rounds,&DWP(240,$key));
&test ($rounds_,$rounds_);
&jz (&label("ecb_decrypt"));
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&jz (&label("ecb_decrypt"));
&cmp ($len,0x60);
&jb (&label("ecb_enc_tail"));
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&sub ($len,0x60);
&jmp (&label("ecb_enc_loop6_enter"));
&set_label("ecb_enc_loop6",16);
&movups (&QWP(0,$out),$inout0);
&movdqu ($inout0,&QWP(0,$inp));
&movups (&QWP(0x10,$out),$inout1);
&movdqu ($inout1,&QWP(0x10,$inp));
&movups (&QWP(0x20,$out),$inout2);
&movdqu ($inout2,&QWP(0x20,$inp));
&movups (&QWP(0x30,$out),$inout3);
&movdqu ($inout3,&QWP(0x30,$inp));
&movups (&QWP(0x40,$out),$inout4);
&movdqu ($inout4,&QWP(0x40,$inp));
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");
&cmp ($len,0x40);
&jbe (&label("ecb_enc_tail"));
&sub ($len,0x40);
&jmp (&label("ecb_enc_loop3"));
&call ("_aesni_encrypt6");
&set_label("ecb_enc_loop3",16);
&movups ($inout0,&QWP(0,$inp));
&movups ($inout1,&QWP(0x10,$inp));
&movups ($inout2,&QWP(0x20,$inp));
&call ("_aesni_encrypt3");
&lea ($inp,&DWP(0x30,$inp));
&movups (&QWP(0,$out),$inout0);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x10,$out),$inout1);
&mov ($rounds,$rounds_); # restore $rounds
&sub ($len,0x60);
&jnc (&label("ecb_enc_loop6"));
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&lea ($out,&DWP(0x30,$out));
&sub ($len,0x30);
&ja (&label("ecb_enc_loop3"));
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&add ($len,0x60);
&jz (&label("ecb_ret"));
&add ($len,0x40);
&set_label("ecb_enc_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
......@@ -316,14 +423,18 @@ if ($PREFIX eq "aesni") {
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_enc_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x30);
&je (&label("ecb_enc_three"));
&cmp ($len,0x40);
&jb (&label("ecb_enc_three"));
&movups ($inout3,&QWP(0x30,$inp));
&call ("_aesni_encrypt4");
&je (&label("ecb_enc_four"));
&movups ($inout4,&QWP(0x40,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_encrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
jmp (&label("ecb_ret"));
&set_label("ecb_enc_one",16);
......@@ -335,7 +446,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_two",16);
&pxor ($inout2,$inout2);
&xorps ($inout2,$inout2);
&call ("_aesni_encrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
......@@ -347,29 +458,65 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_four",16);
&call ("_aesni_encrypt4");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&jmp (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
&cmp ($len,0x40);
&jbe (&label("ecb_dec_tail"));
&sub ($len,0x40);
&jmp (&label("ecb_dec_loop3"));
&set_label("ecb_dec_loop3",16);
&movups ($inout0,&QWP(0,$inp));
&movups ($inout1,&QWP(0x10,$inp));
&movups ($inout2,&QWP(0x20,$inp));
&call ("_aesni_decrypt3");
&lea ($inp,&DWP(0x30,$inp));
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&cmp ($len,0x60);
&jb (&label("ecb_dec_tail"));
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&sub ($len,0x60);
&jmp (&label("ecb_dec_loop6_enter"));
&set_label("ecb_dec_loop6",16);
&movups (&QWP(0,$out),$inout0);
&mov ($key,$key_); # restore $key
&movdqu ($inout0,&QWP(0,$inp));
&movups (&QWP(0x10,$out),$inout1);
&movdqu ($inout1,&QWP(0x10,$inp));
&movups (&QWP(0x20,$out),$inout2);
&movdqu ($inout2,&QWP(0x20,$inp));
&movups (&QWP(0x30,$out),$inout3);
&movdqu ($inout3,&QWP(0x30,$inp));
&movups (&QWP(0x40,$out),$inout4);
&movdqu ($inout4,&QWP(0x40,$inp));
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");
&call ("_aesni_decrypt6");
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&sub ($len,0x60);
&jnc (&label("ecb_dec_loop6"));
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&lea ($out,&DWP(0x30,$out));
&sub ($len,0x30);
&ja (&label("ecb_dec_loop3"));
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&add ($len,0x60);
&jz (&label("ecb_ret"));
&add ($len,0x40);
&set_label("ecb_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
......@@ -377,14 +524,18 @@ if ($PREFIX eq "aesni") {
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_dec_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x30);
&je (&label("ecb_dec_three"));
&cmp ($len,0x40);
&jb (&label("ecb_dec_three"));
&movups ($inout3,&QWP(0x30,$inp));
&call ("_aesni_decrypt4");
&je (&label("ecb_dec_four"));
&movups ($inout4,&QWP(0x40,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_decrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_one",16);
......@@ -396,7 +547,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_two",16);
&pxor ($inout2,$inout2);
&xorps ($inout2,$inout2);
&call ("_aesni_decrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
......@@ -407,6 +558,14 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_four",16);
&call ("_aesni_decrypt4");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
......@@ -420,6 +579,7 @@ if ($PREFIX eq "aesni") {
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
......@@ -433,7 +593,7 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($inout1,&QWP(0,$rounds)); # load cmac
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
......@@ -458,27 +618,47 @@ if ($PREFIX eq "aesni") {
&movdqa ($inout0,$ivec);
&set_label("ccm64_enc_outer");
&movdqu ($in0,&QWP(0,$inp));
&pshufb ($inout0,$inout3);
&mov ($key,$key_);
&mov ($rounds,$rounds_);
&pxor ($inout1,$in0); # cmac^=inp
&pxor ($inout2,$inout2);
&movups ($in0,&QWP(0,$inp));
&pshufb ($inout0,$inout3);
&mov ($key,$key_);
&mov ($rounds,$rounds_);
&call ("_aesni_encrypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($in0,$rndkey0);
&lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=inp
&$movekey ($rndkey0,&QWP(0,$key));
&set_label("ccm64_enc2_loop");
&aesenc ($inout0,$rndkey1);
&dec ($rounds);
&aesenc ($cmac,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key));
&aesenc ($inout0,$rndkey0);
&lea ($key,&DWP(32,$key));
&aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("ccm64_enc2_loop"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
&paddq ($ivec,&QWP(16,"esp"));
&dec ($len);
&lea ($inp,&DWP(16,$inp));
&pxor ($in0,$inout0); # inp^=E(ivec)
&xorps ($in0,$inout0); # inp^=E(ivec)
&movdqa ($inout0,$ivec);
&movdqu (&QWP(0,$out),$in0);
&movups (&QWP(0,$out),$in0);
&lea ($out,&DWP(16,$out));
&jnz (&label("ccm64_enc_outer"));
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movdqu (&QWP(0,$out),$inout1);
&movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
......@@ -494,7 +674,7 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($inout1,&QWP(0,$rounds)); # load cmac
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
......@@ -524,35 +704,56 @@ if ($PREFIX eq "aesni") {
{ &call ("_aesni_encrypt1"); }
&set_label("ccm64_dec_outer");
&movdqu ($in0,&QWP(0,$inp));
&paddq ($ivec,&QWP(16,"esp"));
&dec ($len);
&lea ($inp,&QWP(16,$inp));
&pxor ($in0,$inout0);
&movups ($in0,&QWP(0,$inp)); # load inp
&xorps ($in0,$inout0);
&movdqa ($inout0,$ivec);
&lea ($inp,&QWP(16,$inp));
&pshufb ($inout0,$inout3);
&mov ($key,$key_);
&mov ($rounds,$rounds_);
&pshufb ($inout0,$inout3);
&movdqu (&QWP(0,$out),$in0);
&movups (&QWP(0,$out),$in0);
&lea ($out,&DWP(16,$out));
&sub ($len,1);
&jz (&label("ccm64_dec_break"));
&pxor ($inout2,$inout2);
&call ("_aesni_encrypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($in0,$rndkey0);
&lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=out
&$movekey ($rndkey0,&QWP(0,$key));
&set_label("ccm64_dec2_loop");
&aesenc ($inout0,$rndkey1);
&dec ($rounds);
&aesenc ($cmac,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key));
&aesenc ($inout0,$rndkey0);
&lea ($key,&DWP(32,$key));
&aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("ccm64_dec2_loop"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_break",16);
if ($inline)
{ &aesni_inline_generate1("enc",$inout1); }
{ &aesni_inline_generate1("enc",$cmac,$in0); }
else
{ &call ("_aesni_encrypt1",$inout1); }
{ &call ("_aesni_encrypt1",$cmac); }
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movdqu (&QWP(0,$out),$inout1);
&movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
......@@ -562,6 +763,14 @@ if ($PREFIX eq "aesni") {
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp
&function_begin("aesni_ctr32_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
......@@ -569,14 +778,14 @@ if ($PREFIX eq "aesni") {
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
&sub ("esp",60);
&sub ("esp",88);
&and ("esp",-16); # align stack
&mov (&DWP(48,"esp"),$key_);
&mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
&movups ($inout3,&QWP(0,$rounds_)); # load ivec
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
......@@ -585,139 +794,167 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds,3);
&mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
&pextrd ($rounds_,$inout3,3); # pull 32-bit counter
&pinsrd ($inout3,$key_,3); # wipe 32-bit counter
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
&movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
# $ivec is vector of 3 32-bit counters
&pxor ($ivec,$ivec);
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
&pinsrd ($ivec,$rounds_,0);
&pxor ($rndkey1,$rndkey1);
&pxor ($rndkey0,$rndkey0);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
&pinsrd ($rndkey1,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
&pinsrd ($rndkey0,$key_,0);
&inc ($rounds_);
&pinsrd ($ivec,$rounds_,1);
&pinsrd ($rndkey1,$rounds_,1);
&inc ($key_);
&pinsrd ($rndkey0,$key_,1);
&inc ($rounds_);
&pinsrd ($ivec,$rounds_,2);
&pshufb ($ivec,$rndkey0); # byte swap
&cmp ($len,4);
&jbe (&label("ctr32_tail"));
&movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec
&mov ($rounds_,$rounds);
&mov ($key_,$key);
&sub ($len,4);
&jmp (&label("ctr32_loop3"));
&set_label("ctr32_loop3",16);
&pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
&pshufd ($inout1,$ivec,2<<6);
&por ($inout0,$inout3); # merge counter-less ivec
&pshufd ($inout2,$ivec,1<<6);
&por ($inout1,$inout3);
&por ($inout2,$inout3);
# inline _aesni_encrypt3 and interleave last round
# with own code...
&$movekey ($rndkey0,&QWP(0,$key));
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&lea ($key,&DWP(32,$key));
&pinsrd ($rndkey1,$rounds_,2);
&inc ($key_);
&pinsrd ($rndkey0,$key_,2);
&movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
&pshufb ($rndkey1,$inout0); # byte swap
&movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
&pshufb ($rndkey0,$inout0); # byte swap
&pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
&pshufd ($inout1,$rndkey1,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
&shr ($rounds,1);
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
&pshufd ($inout2,$rndkey1,1<<6);
&movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
&pshufd ($inout3,$rndkey0,3<<6);
&por ($inout0,$rndkey1); # merge counter-less ivec
&pshufd ($inout4,$rndkey0,2<<6);
&por ($inout1,$rndkey1);
&pshufd ($inout5,$rndkey0,1<<6);
&por ($inout2,$rndkey1);
&por ($inout3,$rndkey1);
&por ($inout4,$rndkey1);
&por ($inout5,$rndkey1);
# inlining _aesni_encrypt6's prologue gives ~4% improvement...
&$movekey ($rndkey0,&QWP(0,$key_));
&$movekey ($rndkey1,&QWP(16,$key_));
&lea ($key,&DWP(32,$key_));
&dec ($rounds);
&pxor ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
&set_label("ctr32_enc_loop3");
&aesenc ($inout0,$rndkey1);
&pxor ($inout2,$rndkey0);
&aesenc ($inout1,$rndkey1);
&dec ($rounds);
&pxor ($inout3,$rndkey0);
&aesenc ($inout2,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key));
&aesenc ($inout0,$rndkey0);
&aesenc ($inout1,$rndkey0);
&lea ($key,&DWP(32,$key));
&aesenc ($inout2,$rndkey0);
&pxor ($inout4,$rndkey0);
&aesenc ($inout3,$rndkey1);
&pxor ($inout5,$rndkey0);
&aesenc ($inout4,$rndkey1);
&$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("ctr32_enc_loop3"));
&aesenc ($inout5,$rndkey1);
&aesenc ($inout0,$rndkey1);
&aesenc ($inout1,$rndkey1);
&aesenc ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask
&call (&label("_aesni_encrypt6_enter"));
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&paddd ($rndkey1,$rndkey0); # 1st triplet increment
&paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
&movups ($inout2,&QWP(0x40,$inp));
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
&pshufb ($rndkey1,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
&movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
&pshufb ($rndkey0,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
&pshufd ($inout0,$rndkey1,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&aesenclast ($inout0,$rndkey0);
&pshufb ($ivec,$rndkey1); # byte swap
&movdqu ($in0,&QWP(0,$inp));
&aesenclast ($inout1,$rndkey0);
&paddd ($ivec,&QWP(16,"esp")); # counter increment
&movdqu ($in1,&QWP(0x10,$inp));
&aesenclast ($inout2,$rndkey0);
&pshufb ($ivec,$rndkey1); # byte swap
&movdqu ($rndkey0,&QWP(0x20,$inp));
&lea ($inp,&DWP(0x30,$inp));
&pxor ($in0,$inout0);
&mov ($key,$key_);
&pxor ($in1,$inout1);
&movdqu (&QWP(0,$out),$in0);
&pxor ($rndkey0,$inout2);
&movdqu (&QWP(0x10,$out),$in1);
&movdqu (&QWP(0x20,$out),$rndkey0);
&movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec
&sub ($len,3);
&lea ($out,&DWP(0x30,$out));
&mov ($rounds,$rounds_);
&ja (&label("ctr32_loop3"));
&pshufd ($inout1,$rndkey1,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&pextrd ($rounds_,$ivec,1); # might need last counter value
&add ($len,4);
&bswap ($rounds_);
&add ($len,6);
&jz (&label("ctr32_ret"));
&mov ($key,$key_);
&lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
&movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
&set_label("ctr32_tail");
&pshufd ($inout0,$ivec,3<<6);
&pshufd ($inout1,$ivec,2<<6);
&por ($inout0,$inout3);
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
&lea ($rounds_,&DWP(1,$rounds_));
&pshufd ($inout2,$ivec,1<<6);
&por ($inout1,$inout3);
&je (&label("ctr32_two"));
&bswap ($rounds_);
&por ($inout2,$inout3);
&cmp ($len,3);
&je (&label("ctr32_three"));
&pinsrd ($inout3,$rounds_,3); # compose last counter value
&call ("_aesni_encrypt4");
&pshufd ($inout2,$rndkey1,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
&movdqu ($in0,&QWP(0,$inp));
&movdqu ($rndkey1,&QWP(0x10,$inp));
&pxor ($in0,$inout0);
&movdqu ($rndkey0,&QWP(0x20,$inp));
&pxor ($rndkey1,$inout1);
&movdqu ($ivec,&QWP(0x30,$inp));
&pxor ($rndkey0,$inout2);
&movdqu (&QWP(0,$out),$in0);
&pxor ($ivec,$inout3);
&movdqu (&QWP(0x10,$out),$rndkey1);
&movdqu (&QWP(0x20,$out),$rndkey0);
&movdqu (&QWP(0x30,$out),$ivec);
&pshufd ($inout3,$rndkey0,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
&pshufd ($inout4,$rndkey0,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
&por ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout2,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout3,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout4,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
&movdqu ($inout0,&QWP(0,$rounds_)); # load ivec
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
......@@ -725,37 +962,757 @@ if ($PREFIX eq "aesni") {
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movdqu ($in0,&QWP(0,$inp));
&pxor ($in0,$inout0);
&movdqu (&QWP(0,$out),$in0);
&movups ($in0,&QWP(0,$inp));
&xorps ($in0,$inout0);
&movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
&pxor ($inout2,$inout2);
&call ("_aesni_encrypt3");
&movdqu ($in0,&QWP(0,$inp));
&movdqu ($in1,&QWP(0x10,$inp));
&pxor ($in0,$inout0);
&pxor ($in1,$inout1);
&movdqu (&QWP(0,$out),$in0);
&movdqu (&QWP(0x10,$out),$in1);
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
&movdqu ($in0,&QWP(0,$inp));
&movdqu ($in1,&QWP(0x10,$inp));
&movdqu ($rndkey1,&QWP(0x20,$inp));
&pxor ($in0,$inout0);
&pxor ($in1,$inout1);
&movdqu (&QWP(0,$out),$in0);
&pxor ($rndkey1,$inout2);
&movdqu (&QWP(0x10,$out),$in1);
&movdqu (&QWP(0x20,$out),$rndkey1);
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&movups ($inout5,&QWP(0x20,$inp));
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$inout5);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_four",16);
&call ("_aesni_encrypt4");
&movups ($inout4,&QWP(0,$inp));
&movups ($inout5,&QWP(0x10,$inp));
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout0,$inout4);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout1,$inout5);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&mov ("esp",&DWP(48,"esp"));
&mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2
# const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
&function_begin("aesni_xts_encrypt");
&mov ($key,&wparam(4)); # key2
&mov ($inp,&wparam(5)); # clear-text tweak
&mov ($rounds,&DWP(240,$key)); # key2->rounds
&movups ($inout0,&QWP(0,$inp));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3)); # key1
&mov ($key_,"esp");
&sub ("esp",16*7+8);
&mov ($rounds,&DWP(240,$key)); # key1->rounds
&and ("esp",-16); # align stack
&mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
&mov (&DWP(16*6+4,"esp"),0);
&mov (&DWP(16*6+8,"esp"),1);
&mov (&DWP(16*6+12,"esp"),0);
&mov (&DWP(16*7+0,"esp"),$len); # save original $len
&mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
&movdqa ($tweak,$inout0);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&and ($len,-16);
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&sub ($len,16*6);
&jc (&label("xts_enc_short"));
&shr ($rounds,1);
&mov ($rounds_,$rounds);
&jmp (&label("xts_enc_loop6"));
&set_label("xts_enc_loop6",16);
for ($i=0;$i<4;$i++) {
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa (&QWP(16*$i,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
}
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*$i++,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&$movekey ($rndkey0,&QWP(0,$key_));
&pand ($inout5,$twmask); # isolate carry and residue
&movups ($inout0,&QWP(0,$inp)); # load input
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout1,$rndkey0);
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout2,$rndkey0);
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout3,$rndkey0);
&movdqu ($rndkey1,&QWP(16*5,$inp));
&pxor ($inout4,$rndkey0);
&lea ($inp,&DWP(16*6,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
&lea ($key,&DWP(32,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
&aesenc ($inout0,$rndkey1);
&pxor ($inout2,&QWP(16*2,"esp"));
&aesenc ($inout1,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
&dec ($rounds);
&aesenc ($inout2,$rndkey1);
&pxor ($inout4,&QWP(16*4,"esp"));
&aesenc ($inout3,$rndkey1);
&pxor ($inout5,$rndkey0);
&aesenc ($inout4,$rndkey1);
&$movekey ($rndkey0,&QWP(0,$key));
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
&pxor ($twtmp,$twtmp);
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&xorps ($inout1,&QWP(16*1,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*2,$out),$inout2);
&xorps ($inout4,&QWP(16*4,"esp"));
&movups (&QWP(16*3,$out),$inout3);
&xorps ($inout5,$tweak);
&movups (&QWP(16*4,$out),$inout4);
&pshufd ($twres,$twtmp,0x13);
&movups (&QWP(16*5,$out),$inout5);
&lea ($out,&DWP(16*6,$out));
&movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov ($rounds,$rounds_); # restore $rounds
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_enc_loop6"));
&lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
&set_label("xts_enc_short");
&add ($len,16*6);
&jz (&label("xts_enc_done6x"));
&movdqa ($inout3,$tweak); # put aside previous tweak
&cmp ($len,0x20);
&jb (&label("xts_enc_one"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&je (&label("xts_enc_two"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&cmp ($len,0x40);
&jb (&label("xts_enc_three"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout5,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&movdqa (&QWP(16*0,"esp"),$inout3);
&movdqa (&QWP(16*1,"esp"),$inout4);
&je (&label("xts_enc_four"));
&movdqa (&QWP(16*2,"esp"),$inout5);
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*3,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($inout0,1);
&pand ($inout5,$twmask); # isolate carry and residue
&pxor ($inout5,$tweak);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout1,&QWP(16*1,"esp"));
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout2,&QWP(16*2,"esp"));
&lea ($inp,&DWP(16*5,$inp));
&pxor ($inout3,&QWP(16*3,"esp"));
&movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
&pxor ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movaps ($tweak,&QWP(16*4,"esp")); # last tweak
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout4,$tweak);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&movups (&QWP(16*4,$out),$inout4);
&lea ($out,&DWP(16*5,$out));
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_one",16);
&movups ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16*1,$inp));
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(16*0,$out),$inout0); # write output
&lea ($out,&DWP(16*1,$out));
&movdqa ($tweak,$inout3); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_two",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout2);
&call ("_aesni_encrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&lea ($out,&DWP(16*2,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_three",16);
&movaps ($inout5,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&lea ($inp,&DWP(16*3,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&call ("_aesni_encrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&lea ($out,&DWP(16*3,$out));
&movdqa ($tweak,$inout5); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_four",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movups ($inout3,&QWP(16*3,$inp));
&lea ($inp,&DWP(16*4,$inp));
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&xorps ($inout3,$inout4);
&call ("_aesni_encrypt4");
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,$inout4);
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&lea ($out,&DWP(16*4,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&and ($len,15);
&jz (&label("xts_enc_ret"));
&movdqa ($inout3,$tweak);
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&jmp (&label("xts_enc_steal"));
&set_label("xts_enc_done",16);
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&pxor ($twtmp,$twtmp);
&and ($len,15);
&jz (&label("xts_enc_ret"));
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&pshufd ($inout3,$twtmp,0x13);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
&pxor ($inout3,$tweak);
&set_label("xts_enc_steal");
&movz ($rounds,&BP(0,$inp));
&movz ($key,&BP(-16,$out));
&lea ($inp,&DWP(1,$inp));
&mov (&BP(-16,$out),&LB($rounds));
&mov (&BP(0,$out),&LB($key));
&lea ($out,&DWP(1,$out));
&sub ($len,1);
&jnz (&label("xts_enc_steal"));
&sub ($out,&DWP(16*7+0,"esp")); # rewind $out
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(-16,$out)); # load input
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(-16,$out),$inout0); # write output
&set_label("xts_enc_ret");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");
&function_begin("aesni_xts_decrypt");
&mov ($key,&wparam(4)); # key2
&mov ($inp,&wparam(5)); # clear-text tweak
&mov ($rounds,&DWP(240,$key)); # key2->rounds
&movups ($inout0,&QWP(0,$inp));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3)); # key1
&mov ($key_,"esp");
&sub ("esp",16*7+8);
&and ("esp",-16); # align stack
&xor ($rounds_,$rounds_); # if(len%16) len-=16;
&test ($len,15);
&setnz (&LB($rounds_));
&shl ($rounds_,4);
&sub ($len,$rounds_);
&mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
&mov (&DWP(16*6+4,"esp"),0);
&mov (&DWP(16*6+8,"esp"),1);
&mov (&DWP(16*6+12,"esp"),0);
&mov (&DWP(16*7+0,"esp"),$len); # save original $len
&mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
&mov ($rounds,&DWP(240,$key)); # key1->rounds
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&movdqa ($tweak,$inout0);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&and ($len,-16);
&sub ($len,16*6);
&jc (&label("xts_dec_short"));
&shr ($rounds,1);
&mov ($rounds_,$rounds);
&jmp (&label("xts_dec_loop6"));
&set_label("xts_dec_loop6",16);
for ($i=0;$i<4;$i++) {
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa (&QWP(16*$i,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
}
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*$i++,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&$movekey ($rndkey0,&QWP(0,$key_));
&pand ($inout5,$twmask); # isolate carry and residue
&movups ($inout0,&QWP(0,$inp)); # load input
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout1,$rndkey0);
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout2,$rndkey0);
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout3,$rndkey0);
&movdqu ($rndkey1,&QWP(16*5,$inp));
&pxor ($inout4,$rndkey0);
&lea ($inp,&DWP(16*6,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
&lea ($key,&DWP(32,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
&aesdec ($inout0,$rndkey1);
&pxor ($inout2,&QWP(16*2,"esp"));
&aesdec ($inout1,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
&dec ($rounds);
&aesdec ($inout2,$rndkey1);
&pxor ($inout4,&QWP(16*4,"esp"));
&aesdec ($inout3,$rndkey1);
&pxor ($inout5,$rndkey0);
&aesdec ($inout4,$rndkey1);
&$movekey ($rndkey0,&QWP(0,$key));
&aesdec ($inout5,$rndkey1);
&call (&label("_aesni_decrypt6_enter"));
&movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
&pxor ($twtmp,$twtmp);
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&xorps ($inout1,&QWP(16*1,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*2,$out),$inout2);
&xorps ($inout4,&QWP(16*4,"esp"));
&movups (&QWP(16*3,$out),$inout3);
&xorps ($inout5,$tweak);
&movups (&QWP(16*4,$out),$inout4);
&pshufd ($twres,$twtmp,0x13);
&movups (&QWP(16*5,$out),$inout5);
&lea ($out,&DWP(16*6,$out));
&movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov ($rounds,$rounds_); # restore $rounds
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_dec_loop6"));
&lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
&set_label("xts_dec_short");
&add ($len,16*6);
&jz (&label("xts_dec_done6x"));
&movdqa ($inout3,$tweak); # put aside previous tweak
&cmp ($len,0x20);
&jb (&label("xts_dec_one"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&je (&label("xts_dec_two"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&cmp ($len,0x40);
&jb (&label("xts_dec_three"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout5,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&movdqa (&QWP(16*0,"esp"),$inout3);
&movdqa (&QWP(16*1,"esp"),$inout4);
&je (&label("xts_dec_four"));
&movdqa (&QWP(16*2,"esp"),$inout5);
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*3,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($inout0,1);
&pand ($inout5,$twmask); # isolate carry and residue
&pxor ($inout5,$tweak);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout1,&QWP(16*1,"esp"));
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout2,&QWP(16*2,"esp"));
&lea ($inp,&DWP(16*5,$inp));
&pxor ($inout3,&QWP(16*3,"esp"));
&movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
&pxor ($inout4,$inout5);
&call ("_aesni_decrypt6");
&movaps ($tweak,&QWP(16*4,"esp")); # last tweak
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout4,$tweak);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&movups (&QWP(16*4,$out),$inout4);
&lea ($out,&DWP(16*5,$out));
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_one",16);
&movups ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16*1,$inp));
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(16*0,$out),$inout0); # write output
&lea ($out,&DWP(16*1,$out));
&movdqa ($tweak,$inout3); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_two",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&call ("_aesni_decrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&lea ($out,&DWP(16*2,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_three",16);
&movaps ($inout5,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&lea ($inp,&DWP(16*3,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&call ("_aesni_decrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&lea ($out,&DWP(16*3,$out));
&movdqa ($tweak,$inout5); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_four",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movups ($inout3,&QWP(16*3,$inp));
&lea ($inp,&DWP(16*4,$inp));
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&xorps ($inout3,$inout4);
&call ("_aesni_decrypt4");
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,$inout4);
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&lea ($out,&DWP(16*4,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&and ($len,15);
&jz (&label("xts_dec_ret"));
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&jmp (&label("xts_dec_only_one_more"));
&set_label("xts_dec_done",16);
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&pxor ($twtmp,$twtmp);
&and ($len,15);
&jz (&label("xts_dec_ret"));
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(16*6,"esp"));
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&set_label("xts_dec_only_one_more");
&pshufd ($inout3,$twtmp,0x13);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($inout3,$twmask); # isolate carry and residue
&pxor ($inout3,$tweak);
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(0,$inp)); # load input
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_steal");
&movz ($rounds,&BP(16,$inp));
&movz ($key,&BP(0,$out));
&lea ($inp,&DWP(1,$inp));
&mov (&BP(0,$out),&LB($rounds));
&mov (&BP(16,$out),&LB($key));
&lea ($out,&DWP(1,$out));
&sub ($len,1);
&jnz (&label("xts_dec_steal"));
&sub ($out,&DWP(16*7+0,"esp")); # rewind $out
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(0,$out)); # load input
&xorps ($inout0,$inout4); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout4); # output^=tweak
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_ret");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
}
######################################################################
......@@ -764,34 +1721,38 @@ if ($PREFIX eq "aesni") {
# unsigned char *ivp,const int enc);
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0));
&mov ($rounds_,"esp");
&mov ($out,&wparam(1));
&sub ($rounds_,24);
&mov ($len,&wparam(2));
&and ($rounds_,-16);
&mov ($key,&wparam(3));
&mov ($key_,&wparam(4));
&test ($len,$len);
&jz (&label("cbc_ret"));
&jz (&label("cbc_abort"));
&cmp (&wparam(5),0);
&movdqu ($ivec,&QWP(0,$key_)); # load IV
&xchg ($rounds_,"esp"); # alloca
&movups ($ivec,&QWP(0,$key_)); # load IV
&mov ($rounds,&DWP(240,$key));
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&mov ($key_,$key); # backup $key
&mov (&DWP(16,"esp"),$rounds_); # save original %esp
&mov ($rounds_,$rounds); # backup $rounds
&je (&label("cbc_decrypt"));
&movdqa ($inout0,$ivec);
&movaps ($inout0,$ivec);
&cmp ($len,16);
&jb (&label("cbc_enc_tail"));
&sub ($len,16);
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ($ivec,&QWP(0,$inp));
&movups ($ivec,&QWP(0,$inp)); # input actually
&lea ($inp,&DWP(16,$inp));
&pxor ($inout0,$ivec);
if ($inline)
{ &aesni_inline_generate1("enc"); }
{ &aesni_inline_generate1("enc",$inout0,$ivec); }
else
{ &call ("_aesni_encrypt1"); }
{ &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
&mov ($rounds,$rounds_); # restore $rounds
&mov ($key,$key_); # restore $key
&movups (&QWP(0,$out),$inout0); # store output
......@@ -817,61 +1778,97 @@ if ($PREFIX eq "aesni") {
&jmp (&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
&cmp ($len,0x40);
&cmp ($len,0x50);
&jbe (&label("cbc_dec_tail"));
&sub ($len,0x40);
&jmp (&label("cbc_dec_loop3"));
&movaps (&QWP(0,"esp"),$ivec); # save IV
&sub ($len,0x50);
&jmp (&label("cbc_dec_loop6_enter"));
&set_label("cbc_dec_loop3",16);
&movups ($inout0,&QWP(0,$inp));
&movups ($inout1,&QWP(0x10,$inp));
&movups ($inout2,&QWP(0x20,$inp));
&movaps ($in0,$inout0);
&movaps ($in1,$inout1);
&call ("_aesni_decrypt3");
&pxor ($inout0,$ivec);
&pxor ($inout1,$in0);
&movdqu ($ivec,&QWP(0x20,$inp));
&lea ($inp,&DWP(0x30,$inp));
&pxor ($inout2,$in1);
&movdqu (&QWP(0,$out),$inout0);
&mov ($rounds,$rounds_) # restore $rounds
&movdqu (&QWP(0x10,$out),$inout1);
&mov ($key,$key_); # restore $key
&movdqu (&QWP(0x20,$out),$inout2);
&lea ($out,&DWP(0x30,$out));
&sub ($len,0x30);
&ja (&label("cbc_dec_loop3"));
&add ($len,0x40);
&set_label("cbc_dec_loop6",16);
&movaps (&QWP(0,"esp"),$rndkey0); # save IV
&movups (&QWP(0,$out),$inout5);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&call ("_aesni_decrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,&QWP(0,"esp")); # ^=IV
&xorps ($inout1,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout2,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout3,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout4,$rndkey0);
&movups ($rndkey0,&QWP(0x50,$inp)); # IV
&xorps ($inout5,$rndkey1);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&lea ($inp,&DWP(0x60,$inp));
&movups (&QWP(0x20,$out),$inout2);
&mov ($rounds,$rounds_) # restore $rounds
&movups (&QWP(0x30,$out),$inout3);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x40,$out),$inout4);
&lea ($out,&DWP(0x50,$out));
&sub ($len,0x60);
&ja (&label("cbc_dec_loop6"));
&movaps ($inout0,$inout5);
&movaps ($ivec,$rndkey0);
&add ($len,0x50);
&jle (&label("cbc_dec_tail_collected"));
&movups (&QWP(0,$out),$inout0);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&movaps ($in0,$inout0);
&cmp ($len,0x10);
&jbe (&label("cbc_dec_one"));
&movups ($inout1,&QWP(0x10,$inp));
&movaps ($in1,$inout1);
&cmp ($len,0x20);
&jbe (&label("cbc_dec_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x30);
&jbe (&label("cbc_dec_three"));
&movups ($inout3,&QWP(0x30,$inp));
&call ("_aesni_decrypt4");
&movdqu ($rndkey0,&QWP(0x10,$inp));
&movdqu ($rndkey1,&QWP(0x20,$inp));
&pxor ($inout0,$ivec);
&pxor ($inout1,$in0);
&movdqu ($ivec,&QWP(0x30,$inp));
&movdqu (&QWP(0,$out),$inout0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey1);
&movdqu (&QWP(0x10,$out),$inout1);
&movdqu (&QWP(0x20,$out),$inout2);
&movdqa ($inout0,$inout3);
&lea ($out,&DWP(0x30,$out));
&cmp ($len,0x40);
&jbe (&label("cbc_dec_four"));
&movups ($inout4,&QWP(0x40,$inp));
&movaps (&QWP(0,"esp"),$ivec); # save IV
&movups ($inout0,&QWP(0,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_decrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,&QWP(0,"esp")); # ^= IV
&xorps ($inout1,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout2,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout3,$rndkey1);
&movups ($ivec,&QWP(0x40,$inp)); # IV
&xorps ($inout4,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&lea ($out,&DWP(0x40,$out));
&movaps ($inout0,$inout4);
&sub ($len,0x50);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_one",16);
......@@ -879,51 +1876,70 @@ if ($PREFIX eq "aesni") {
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&pxor ($inout0,$ivec);
&movdqa ($ivec,$in0);
&xorps ($inout0,$ivec);
&movaps ($ivec,$in0);
&sub ($len,0x10);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_two",16);
&pxor ($inout2,$inout2);
&xorps ($inout2,$inout2);
&call ("_aesni_decrypt3");
&pxor ($inout0,$ivec);
&pxor ($inout1,$in0);
&movdqu (&QWP(0,$out),$inout0);
&movdqa ($inout0,$inout1);
&movdqa ($ivec,$in1);
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout1);
&lea ($out,&DWP(0x10,$out));
&movaps ($ivec,$in1);
&sub ($len,0x20);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_three",16);
&call ("_aesni_decrypt3");
&pxor ($inout0,$ivec);
&pxor ($inout1,$in0);
&pxor ($inout2,$in1);
&movdqu (&QWP(0,$out),$inout0);
&movdqu (&QWP(0x10,$out),$inout1);
&movdqa ($inout0,$inout2);
&movdqu ($ivec,&QWP(0x20,$inp));
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&xorps ($inout2,$in1);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout2);
&movups (&QWP(0x10,$out),$inout1);
&lea ($out,&DWP(0x20,$out));
&movups ($ivec,&QWP(0x20,$inp));
&sub ($len,0x30);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_four",16);
&call ("_aesni_decrypt4");
&movups ($rndkey1,&QWP(0x10,$inp));
&movups ($rndkey0,&QWP(0x20,$inp));
&xorps ($inout0,$ivec);
&movups ($ivec,&QWP(0x30,$inp));
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&lea ($out,&DWP(0x30,$out));
&movaps ($inout0,$inout3);
&sub ($len,0x40);
&set_label("cbc_dec_tail_collected");
&and ($len,15);
&jnz (&label("cbc_dec_tail_partial"));
&movdqu (&QWP(0,$out),$inout0);
&movups (&QWP(0,$out),$inout0);
&jmp (&label("cbc_ret"));
&set_label("cbc_dec_tail_partial",16);
&mov ($key_,"esp");
&sub ("esp",16);
&and ("esp",-16);
&movdqa (&QWP(0,"esp"),$inout0);
&movaps (&QWP(0,"esp"),$inout0);
&mov ("ecx",16);
&mov ($inp,"esp");
&mov ("ecx",$len);
&sub ("ecx",$len);
&data_word(0xA4F3F689); # rep movsb
&mov ("esp",$key_);
&set_label("cbc_ret");
&mov ("esp",&DWP(16,"esp")); # pull original %esp
&mov ($key_,&wparam(4));
&movups (&QWP(0,$key_),$ivec); # output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
######################################################################
......@@ -945,7 +1961,7 @@ if ($PREFIX eq "aesni") {
&jz (&label("bad_pointer"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&lea ($key,&DWP(16,$key));
&cmp ($rounds,256);
&je (&label("14rounds"));
......@@ -987,11 +2003,11 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(16,$key));
&set_label("key_128_cold");
&shufps ("xmm4","xmm0",0b00010000);
&pxor ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100,);
&pxor ("xmm0","xmm4");
&pshufd ("xmm1","xmm1",0b11111111); # critical path
&pxor ("xmm0","xmm1");
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("12rounds",16);
......@@ -1026,11 +2042,11 @@ if ($PREFIX eq "aesni") {
&movaps ("xmm5","xmm2");
&set_label("key_192b_warm");
&shufps ("xmm4","xmm0",0b00010000);
&movaps ("xmm3","xmm2");
&pxor ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&pslldq ("xmm3",4);
&pxor ("xmm0","xmm4");
&xorps ("xmm0","xmm4");
&pshufd ("xmm1","xmm1",0b01010101); # critical path
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm1");
......@@ -1089,11 +2105,11 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(16,$key));
&set_label("key_256a_cold");
&shufps ("xmm4","xmm0",0b00010000);
&pxor ("xmm0","xmm4");
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&pxor ("xmm0","xmm4");
&pshufd ("xmm1","xmm1",0b11111111); # critical path
&pxor ("xmm0","xmm1");
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("key_256b",16);
......@@ -1101,11 +2117,11 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(16,$key));
&shufps ("xmm4","xmm2",0b00010000);
&pxor ("xmm2","xmm4");
&xorps ("xmm2","xmm4");
&shufps ("xmm4","xmm2",0b10001100);
&pxor ("xmm2","xmm4");
&pshufd ("xmm1","xmm1",0b10101010); # critical path
&pxor ("xmm2","xmm1");
&xorps ("xmm2","xmm4");
&shufps ("xmm1","xmm1",0b10101010); # critical path
&xorps ("xmm2","xmm1");
&ret();
&set_label("bad_pointer",4);
......
......@@ -18,7 +18,7 @@
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit is not something you commonly achieve in reality,
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sized. Pairs of numbers are for en-/
# decryption.
......@@ -79,6 +79,84 @@
# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor 3x 6x 8x
# theoretical asymptotic limit 1.67 0.83 0.625
# measured performance for 8KB block 1.05 0.86 0.84
#
# "as if" interleave factor 4.7x 5.8x 6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt 1.16 0.93 0.93
# CTR 1.14 0.91 n/a
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instuctions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
# utilizies 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
......@@ -114,12 +192,14 @@ $rnds_="%r10d"; # backup copy for $rounds
$key_="%r11"; # backup copy for $key
# %xmm register layout
$inout0="%xmm0"; $inout1="%xmm1";
$inout2="%xmm2"; $inout3="%xmm3";
$rndkey0="%xmm4"; $rndkey1="%xmm5";
$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR, ...
$in1="%xmm8"; $in2="%xmm9";
$rndkey0="%xmm0"; $rndkey1="%xmm1";
$inout0="%xmm2"; $inout1="%xmm3";
$inout2="%xmm4"; $inout3="%xmm5";
$inout4="%xmm6"; $inout5="%xmm7";
$inout6="%xmm8"; $inout7="%xmm9";
$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
$in0="%xmm8"; $iv="%xmm9";
# Inline version of internal aesni_[en|de]crypt1.
#
......@@ -127,13 +207,22 @@ $in1="%xmm8"; $in2="%xmm9";
# cycles which take care of loop variables...
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout)=@_; $inout=$inout0 if (!defined($inout));
my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
movdqu ($key),$rndkey0
$movkey ($key),$rndkey0
$movkey 16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
xorps $rndkey0,$ivec
lea 32($key),$key
xorps $ivec,$inout
___
$code.=<<___ if (!defined($ivec));
lea 32($key),$key
pxor $rndkey0,$inout
xorps $rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
aes${p} $rndkey1,$inout
dec $rounds
......@@ -152,8 +241,8 @@ $code.=<<___;
.type ${PREFIX}_encrypt,\@abi-omnipotent
.align 16
${PREFIX}_encrypt:
movdqu ($inp),$inout0 # load input
mov 240($key),$rounds # pull $rounds
movups ($inp),$inout0 # load input
mov 240($key),$rounds # key->rounds
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
......@@ -165,8 +254,8 @@ $code.=<<___;
.type ${PREFIX}_decrypt,\@abi-omnipotent
.align 16
${PREFIX}_decrypt:
movdqu ($inp),$inout0 # load input
mov 240($key),$rounds # pull $rounds
movups ($inp),$inout0 # load input
mov 240($key),$rounds # key->rounds
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
......@@ -176,16 +265,16 @@ $code.=<<___;
___
}
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine. As soon
# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycles I would have to
# implement 6x interleave and use it in loop...
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
......@@ -198,9 +287,9 @@ _aesni_${dir}rypt3:
shr \$1,$rounds
$movkey 16($key),$rndkey1
lea 32($key),$key
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
pxor $rndkey0,$inout2
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
$movkey ($key),$rndkey0
.L${dir}_loop3:
......@@ -242,11 +331,11 @@ _aesni_${dir}rypt4:
shr \$1,$rounds
$movkey 16($key),$rndkey1
lea 32($key),$key
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
pxor $rndkey0,$inout2
pxor $rndkey0,$inout3
$movkey ($key),$rndkey0
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
xorps $rndkey0,$inout3
$movkey ($key),$rndkey0
.L${dir}_loop4:
aes${dir} $rndkey1,$inout0
......@@ -275,10 +364,155 @@ _aesni_${dir}rypt4:
.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt6,\@abi-omnipotent
.align 16
_aesni_${dir}rypt6:
$movkey ($key),$rndkey0
shr \$1,$rounds
$movkey 16($key),$rndkey1
lea 32($key),$key
xorps $rndkey0,$inout0
pxor $rndkey0,$inout1
aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
aes${dir} $rndkey1,$inout3
pxor $rndkey0,$inout5
dec $rounds
aes${dir} $rndkey1,$inout4
$movkey ($key),$rndkey0
aes${dir} $rndkey1,$inout5
jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
dec $rounds
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
.L${dir}_loop6_enter: # happens to be 16-byte aligned
$movkey 16($key),$rndkey1
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
$movkey ($key),$rndkey0
jnz .L${dir}_loop6
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
aes${dir}last $rndkey0,$inout0
aes${dir}last $rndkey0,$inout1
aes${dir}last $rndkey0,$inout2
aes${dir}last $rndkey0,$inout3
aes${dir}last $rndkey0,$inout4
aes${dir}last $rndkey0,$inout5
ret
.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type _aesni_${dir}rypt8,\@abi-omnipotent
.align 16
_aesni_${dir}rypt8:
$movkey ($key),$rndkey0
shr \$1,$rounds
$movkey 16($key),$rndkey1
lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
aes${dir} $rndkey1,$inout3
pxor $rndkey0,$inout5
dec $rounds
aes${dir} $rndkey1,$inout4
pxor $rndkey0,$inout6
aes${dir} $rndkey1,$inout5
pxor $rndkey0,$inout7
$movkey ($key),$rndkey0
aes${dir} $rndkey1,$inout6
aes${dir} $rndkey1,$inout7
$movkey 16($key),$rndkey1
jmp .L${dir}_loop8_enter
.align 16
.L${dir}_loop8:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
dec $rounds
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
aes${dir} $rndkey1,$inout6
aes${dir} $rndkey1,$inout7
$movkey 16($key),$rndkey1
.L${dir}_loop8_enter: # happens to be 16-byte aligned
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
aes${dir} $rndkey0,$inout6
aes${dir} $rndkey0,$inout7
$movkey ($key),$rndkey0
jnz .L${dir}_loop8
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
aes${dir} $rndkey1,$inout6
aes${dir} $rndkey1,$inout7
aes${dir}last $rndkey0,$inout0
aes${dir}last $rndkey0,$inout1
aes${dir}last $rndkey0,$inout2
aes${dir}last $rndkey0,$inout3
aes${dir}last $rndkey0,$inout4
aes${dir}last $rndkey0,$inout5
aes${dir}last $rndkey0,$inout6
aes${dir}last $rndkey0,$inout7
ret
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");
if ($PREFIX eq "aesni") {
########################################################################
......@@ -290,37 +524,73 @@ $code.=<<___;
.type aesni_ecb_encrypt,\@function,5
.align 16
aesni_ecb_encrypt:
cmp \$16,$len # check length
jb .Lecb_ret
mov 240($key),$rounds # pull $rounds
and \$-16,$len
jz .Lecb_ret
mov 240($key),$rounds # key->rounds
$movkey ($key),$rndkey0
mov $key,$key_ # backup $key
mov $rounds,$rnds_ # backup $rounds
test %r8d,%r8d # 5th argument
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
cmp \$0x40,$len
jbe .Lecb_enc_tail
sub \$0x40,$len
jmp .Lecb_enc_loop3
cmp \$0x80,$len
jb .Lecb_enc_tail
movdqu ($inp),$inout0
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x40($inp),$inout4
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
sub \$0x80,$len
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop3:
movups ($inp),$inout0
movups 0x10($inp),$inout1
movups 0x20($inp),$inout2
call _aesni_encrypt3
lea 0x30($inp),$inp
.Lecb_enc_loop8:
movups $inout0,($out)
mov $key_,$key # restore $key
movdqu ($inp),$inout0
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout2,0x20($out)
movdqu 0x20($inp),$inout2
movups $inout3,0x30($out)
movdqu 0x30($inp),$inout3
movups $inout4,0x40($out)
movdqu 0x40($inp),$inout4
movups $inout5,0x50($out)
movdqu 0x50($inp),$inout5
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
lea 0x80($out),$out
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
sub \$0x80,$len
jnc .Lecb_enc_loop8
movups $inout0,($out)
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
lea 0x30($out),$out
sub \$0x30,$len
ja .Lecb_enc_loop3
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
lea 0x80($out),$out
add \$0x80,$len
jz .Lecb_ret
add \$0x40,$len
.Lecb_enc_tail:
movups ($inp),$inout0
cmp \$0x20,$len
......@@ -328,14 +598,24 @@ aesni_ecb_encrypt:
movups 0x10($inp),$inout1
je .Lecb_enc_two
movups 0x20($inp),$inout2
cmp \$0x30,$len
je .Lecb_enc_three
cmp \$0x40,$len
jb .Lecb_enc_three
movups 0x30($inp),$inout3
call _aesni_encrypt4
je .Lecb_enc_four
movups 0x40($inp),$inout4
cmp \$0x60,$len
jb .Lecb_enc_five
movups 0x50($inp),$inout5
je .Lecb_enc_six
movdqu 0x60($inp),$inout6
call _aesni_encrypt8
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_one:
......@@ -346,7 +626,7 @@ $code.=<<___;
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
pxor $inout2,$inout2
xorps $inout2,$inout2
call _aesni_encrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
......@@ -358,30 +638,95 @@ $code.=<<___;
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_five:
xorps $inout5,$inout5
call _aesni_encrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
jmp .Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
cmp \$0x40,$len
jbe .Lecb_dec_tail
sub \$0x40,$len
jmp .Lecb_dec_loop3
cmp \$0x80,$len
jb .Lecb_dec_tail
movdqu ($inp),$inout0
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x40($inp),$inout4
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
sub \$0x80,$len
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop3:
movups ($inp),$inout0
movups 0x10($inp),$inout1
movups 0x20($inp),$inout2
call _aesni_decrypt3
lea 0x30($inp),$inp
.Lecb_dec_loop8:
movups $inout0,($out)
mov $key_,$key # restore $key
movdqu ($inp),$inout0
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout2,0x20($out)
movdqu 0x20($inp),$inout2
movups $inout3,0x30($out)
movdqu 0x30($inp),$inout3
movups $inout4,0x40($out)
movdqu 0x40($inp),$inout4
movups $inout5,0x50($out)
movdqu 0x50($inp),$inout5
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
lea 0x80($out),$out
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
$movkey ($key_),$rndkey0
sub \$0x80,$len
jnc .Lecb_dec_loop8
movups $inout0,($out)
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
lea 0x30($out),$out
sub \$0x30,$len
ja .Lecb_dec_loop3
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
lea 0x80($out),$out
add \$0x80,$len
jz .Lecb_ret
add \$0x40,$len
.Lecb_dec_tail:
movups ($inp),$inout0
cmp \$0x20,$len
......@@ -389,14 +734,25 @@ $code.=<<___;
movups 0x10($inp),$inout1
je .Lecb_dec_two
movups 0x20($inp),$inout2
cmp \$0x30,$len
je .Lecb_dec_three
cmp \$0x40,$len
jb .Lecb_dec_three
movups 0x30($inp),$inout3
call _aesni_decrypt4
je .Lecb_dec_four
movups 0x40($inp),$inout4
cmp \$0x60,$len
jb .Lecb_dec_five
movups 0x50($inp),$inout5
je .Lecb_dec_six
movups 0x60($inp),$inout6
$movkey ($key),$rndkey0
call _aesni_decrypt8
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
......@@ -407,7 +763,7 @@ $code.=<<___;
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
pxor $inout2,$inout2
xorps $inout2,$inout2
call _aesni_decrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
......@@ -418,6 +774,34 @@ $code.=<<___;
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
.Lecb_ret:
ret
......@@ -467,25 +851,45 @@ $code.=<<___;
movdqa $iv,$inout0
.Lccm64_enc_outer:
movdqu ($inp),$in0 # load inp
movups ($inp),$in0 # load inp
pshufb $bswap_mask,$inout0
mov $key_,$key
mov $rnds_,$rounds
pxor $in0,$inout1 # cmac^=inp
pxor $inout2,$inout2
call _aesni_encrypt3
$movkey ($key),$rndkey0
shr \$1,$rounds
$movkey 16($key),$rndkey1
xorps $rndkey0,$in0
lea 32($key),$key
xorps $rndkey0,$inout0
xorps $inout1,$in0 # cmac^=inp
$movkey ($key),$rndkey0
.Lccm64_enc2_loop:
aesenc $rndkey1,$inout0
dec $rounds
aesenc $rndkey1,$inout1
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
lea 32($key),$key
aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0
jnz .Lccm64_enc2_loop
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
paddq $increment,$iv
dec $len
lea 16($inp),$inp
pxor $inout0,$in0 # inp ^= E(iv)
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
movdqu $in0,($out) # save output
movups $in0,($out) # save output
lea 16($out),$out
jnz .Lccm64_enc_outer
movdqu $inout1,($cmac)
movups $inout1,($cmac)
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
......@@ -529,24 +933,42 @@ ___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
.Lccm64_dec_outer:
movdqu ($inp),$in0 # load inp
paddq $increment,$iv
dec $len
lea 16($inp),$inp
pxor $inout0,$in0
movups ($inp),$in0 # load inp
xorps $inout0,$in0
movdqa $iv,$inout0
lea 16($inp),$inp
pshufb $bswap_mask,$inout0
mov $key_,$key
mov $rnds_,$rounds
pshufb $bswap_mask,$inout0
movdqu $in0,($out)
movups $in0,($out)
lea 16($out),$out
pxor $in0,$inout1 # cmac^=out
sub \$1,$len
jz .Lccm64_dec_break
pxor $inout2,$inout2
call _aesni_encrypt3
$movkey ($key),$rndkey0
shr \$1,$rounds
$movkey 16($key),$rndkey1
xorps $rndkey0,$in0
lea 32($key),$key
xorps $rndkey0,$inout0
xorps $in0,$inout1 # cmac^=out
$movkey ($key),$rndkey0
.Lccm64_dec2_loop:
aesenc $rndkey1,$inout0
dec $rounds
aesenc $rndkey1,$inout1
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
lea 32($key),$key
aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0
jnz .Lccm64_dec2_loop
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenclast $rndkey0,$inout0
jmp .Lccm64_dec_outer
.align 16
......@@ -554,7 +976,7 @@ $code.=<<___;
___
&aesni_generate1("enc",$key,$rounds,$inout1);
$code.=<<___;
movdqu $inout1,($cmac)
movups $inout1,($cmac)
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
......@@ -566,219 +988,1133 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
ret
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
{
my $reserved = $win64?0:-0x28;
my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
my $bswap_mask="%xmm15";
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
___
$code.=<<___ if ($win64);
lea -0xc8(%rsp),%rsp
movaps %xmm6,0x20(%rsp)
movaps %xmm7,0x30(%rsp)
movaps %xmm8,0x40(%rsp)
movaps %xmm9,0x50(%rsp)
movaps %xmm10,0x60(%rsp)
movaps %xmm11,0x70(%rsp)
movaps %xmm12,0x80(%rsp)
movaps %xmm13,0x90(%rsp)
movaps %xmm14,0xa0(%rsp)
movaps %xmm15,0xb0(%rsp)
.Lctr32_body:
___
$code.=<<___;
cmp \$1,$len
je .Lctr32_one_shortcut
movdqu ($ivp),$ivec
movdqa .Lbswap_mask(%rip),$bswap_mask
xor $rounds,$rounds
pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
mov 240($key),$rounds # key->rounds
bswap $rnds_
pxor $iv0,$iv0 # vector of 3 32-bit counters
pxor $iv1,$iv1 # vector of 3 32-bit counters
pinsrd \$0,$rnds_,$iv0
lea 3($rnds_),$key_
pinsrd \$0,$key_,$iv1
inc $rnds_
pinsrd \$1,$rnds_,$iv0
inc $key_
pinsrd \$1,$key_,$iv1
inc $rnds_
pinsrd \$2,$rnds_,$iv0
inc $key_
pinsrd \$2,$key_,$iv1
movdqa $iv0,$reserved(%rsp)
pshufb $bswap_mask,$iv0
movdqa $iv1,`$reserved+0x10`(%rsp)
pshufb $bswap_mask,$iv1
pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
pshufd \$`2<<6`,$iv0,$inout1
pshufd \$`1<<6`,$iv0,$inout2
cmp \$6,$len
jb .Lctr32_tail
shr \$1,$rounds
mov $key,$key_ # backup $key
mov $rounds,$rnds_ # backup $rounds
sub \$6,$len
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
pshufd \$`3<<6`,$iv1,$inout3
por $ivec,$inout0 # merge counter-less ivec
$movkey ($key_),$rndkey0
pshufd \$`2<<6`,$iv1,$inout4
por $ivec,$inout1
$movkey 16($key_),$rndkey1
pshufd \$`1<<6`,$iv1,$inout5
por $ivec,$inout2
por $ivec,$inout3
xorps $rndkey0,$inout0
por $ivec,$inout4
por $ivec,$inout5
# inline _aesni_encrypt6 and interleave last rounds
# with own code...
pxor $rndkey0,$inout1
aesenc $rndkey1,$inout0
lea 32($key_),$key
pxor $rndkey0,$inout2
aesenc $rndkey1,$inout1
movdqa .Lincrement32(%rip),$iv1
pxor $rndkey0,$inout3
aesenc $rndkey1,$inout2
movdqa $reserved(%rsp),$iv0
pxor $rndkey0,$inout4
aesenc $rndkey1,$inout3
pxor $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
jmp .Lctr32_enc_loop6_enter
.align 16
.Lctr32_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
.Lctr32_enc_loop6_enter:
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
jnz .Lctr32_enc_loop6
aesenc $rndkey1,$inout0
paddd $iv1,$iv0 # increment counter vector
aesenc $rndkey1,$inout1
paddd `$reserved+0x10`(%rsp),$iv1
aesenc $rndkey1,$inout2
movdqa $iv0,$reserved(%rsp) # save counter vector
aesenc $rndkey1,$inout3
movdqa $iv1,`$reserved+0x10`(%rsp)
aesenc $rndkey1,$inout4
pshufb $bswap_mask,$iv0 # byte swap
aesenc $rndkey1,$inout5
pshufb $bswap_mask,$iv1
aesenclast $rndkey0,$inout0
movups ($inp),$in0 # load input
aesenclast $rndkey0,$inout1
movups 0x10($inp),$in1
aesenclast $rndkey0,$inout2
movups 0x20($inp),$in2
aesenclast $rndkey0,$inout3
movups 0x30($inp),$in3
aesenclast $rndkey0,$inout4
movups 0x40($inp),$rndkey1
aesenclast $rndkey0,$inout5
movups 0x50($inp),$rndkey0
lea 0x60($inp),$inp
xorps $inout0,$in0 # xor
pshufd \$`3<<6`,$iv0,$inout0
xorps $inout1,$in1
pshufd \$`2<<6`,$iv0,$inout1
movups $in0,($out) # store output
xorps $inout2,$in2
pshufd \$`1<<6`,$iv0,$inout2
movups $in1,0x10($out)
xorps $inout3,$in3
movups $in2,0x20($out)
xorps $inout4,$rndkey1
movups $in3,0x30($out)
xorps $inout5,$rndkey0
movups $rndkey1,0x40($out)
movups $rndkey0,0x50($out)
lea 0x60($out),$out
mov $rnds_,$rounds
sub \$6,$len
jnc .Lctr32_loop6
add \$6,$len
jz .Lctr32_done
mov $key_,$key # restore $key
lea 1($rounds,$rounds),$rounds # restore original value
.Lctr32_tail:
por $ivec,$inout0
movups ($inp),$in0
cmp \$2,$len
jb .Lctr32_one
por $ivec,$inout1
movups 0x10($inp),$in1
je .Lctr32_two
pshufd \$`3<<6`,$iv1,$inout3
por $ivec,$inout2
movups 0x20($inp),$in2
cmp \$4,$len
jb .Lctr32_three
pshufd \$`2<<6`,$iv1,$inout4
por $ivec,$inout3
movups 0x30($inp),$in3
je .Lctr32_four
por $ivec,$inout4
xorps $inout5,$inout5
call _aesni_encrypt6
movups 0x40($inp),$rndkey1
xorps $inout0,$in0
xorps $inout1,$in1
movups $in0,($out)
xorps $inout2,$in2
movups $in1,0x10($out)
xorps $inout3,$in3
movups $in2,0x20($out)
xorps $inout4,$rndkey1
movups $in3,0x30($out)
movups $rndkey1,0x40($out)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups ($ivp),$inout0
movups ($inp),$in0
mov 240($key),$rounds # key->rounds
.Lctr32_one:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps $inout0,$in0
movups $in0,($out)
jmp .Lctr32_done
.align 16
.Lctr32_two:
xorps $inout2,$inout2
call _aesni_encrypt3
xorps $inout0,$in0
xorps $inout1,$in1
movups $in0,($out)
movups $in1,0x10($out)
jmp .Lctr32_done
.align 16
.Lctr32_three:
call _aesni_encrypt3
xorps $inout0,$in0
xorps $inout1,$in1
movups $in0,($out)
xorps $inout2,$in2
movups $in1,0x10($out)
movups $in2,0x20($out)
jmp .Lctr32_done
.align 16
.Lctr32_four:
call _aesni_encrypt4
xorps $inout0,$in0
xorps $inout1,$in1
movups $in0,($out)
xorps $inout2,$in2
movups $in1,0x10($out)
xorps $inout3,$in3
movups $in2,0x20($out)
movups $in3,0x30($out)
.Lctr32_done:
___
$code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm6
movaps 0x30(%rsp),%xmm7
movaps 0x40(%rsp),%xmm8
movaps 0x50(%rsp),%xmm9
movaps 0x60(%rsp),%xmm10
movaps 0x70(%rsp),%xmm11
movaps 0x80(%rsp),%xmm12
movaps 0x90(%rsp),%xmm13
movaps 0xa0(%rsp),%xmm14
movaps 0xb0(%rsp),%xmm15
lea 0xc8(%rsp),%rsp
.Lctr32_ret:
___
$code.=<<___;
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2
# const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x68 + ($win64?160:0);
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
lea -$frame_size(%rsp),%rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,0x60(%rsp)
movaps %xmm7,0x70(%rsp)
movaps %xmm8,0x80(%rsp)
movaps %xmm9,0x90(%rsp)
movaps %xmm10,0xa0(%rsp)
movaps %xmm11,0xb0(%rsp)
movaps %xmm12,0xc0(%rsp)
movaps %xmm13,0xd0(%rsp)
movaps %xmm14,0xe0(%rsp)
movaps %xmm15,0xf0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
mov $len,$len_ # backup $len
and \$-16,$len
movdqa .Lxts_magic(%rip),$twmask
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[$i]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,$twres # isolate carry and residue
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
___
}
$code.=<<___;
sub \$16*6,$len
jc .Lxts_enc_short
shr \$1,$rounds
sub \$1,$rounds
mov $rounds,$rnds_
jmp .Lxts_enc_grandloop
.align 16
.Lxts_enc_grandloop:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
pand $twmask,$twres # isolate carry and residue
movdqu `16*1`($inp),$inout1
pxor $twres,@tweak[5]
movdqu `16*2`($inp),$inout2
pxor @tweak[0],$inout0 # input^=tweak
movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[2],$inout2
movdqu `16*5`($inp),$inout5
lea `16*6`($inp),$inp
pxor @tweak[3],$inout3
$movkey ($key_),$rndkey0
pxor @tweak[4],$inout4
pxor @tweak[5],$inout5
# inline _aesni_encrypt6 and interleave first and last rounds
# with own code...
$movkey 16($key_),$rndkey1
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
aesenc $rndkey1,$inout0
lea 32($key_),$key
pxor $rndkey0,$inout2
movdqa @tweak[1],`16*1`(%rsp)
aesenc $rndkey1,$inout1
pxor $rndkey0,$inout3
movdqa @tweak[2],`16*2`(%rsp)
aesenc $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
pxor $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
movdqa @tweak[4],`16*4`(%rsp)
aesenc $rndkey1,$inout4
movdqa @tweak[5],`16*5`(%rsp)
aesenc $rndkey1,$inout5
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp
jmp .Lxts_enc_loop6_enter
.align 16
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
.Lxts_enc_loop6_enter:
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
jnz .Lxts_enc_loop6
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesenc $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
aesenc $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
aesenc $rndkey1,$inout2
pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesenc $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesenc $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesenc $rndkey0,$inout2
pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey 32($key),$rndkey0
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[1]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesenc $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
aesenc $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesenc $rndkey1,$inout2
pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[2]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesenclast $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesenclast $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesenclast $rndkey0,$inout2
pxor $twres,@tweak[5]
aesenclast $rndkey0,$inout3
aesenclast $rndkey0,$inout4
aesenclast $rndkey0,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[3]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
xorps `16*0`(%rsp),$inout0 # output^=tweak
pand $twmask,$twres # isolate carry and residue
xorps `16*1`(%rsp),$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
xorps `16*2`(%rsp),$inout2
movups $inout0,`16*0`($out) # write output
xorps `16*3`(%rsp),$inout3
movups $inout1,`16*1`($out)
xorps `16*4`(%rsp),$inout4
movups $inout2,`16*2`($out)
xorps `16*5`(%rsp),$inout5
movups $inout3,`16*3`($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,`16*4`($out)
movups $inout5,`16*5`($out)
lea `16*6`($out),$out
sub \$16*6,$len
jnc .Lxts_enc_grandloop
lea 3($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_enc_short:
add \$16*6,$len
jz .Lxts_enc_done
cmp \$0x20,$len
jb .Lxts_enc_one
je .Lxts_enc_two
cmp \$0x40,$len
jb .Lxts_enc_three
je .Lxts_enc_four
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movdqu 16*1($inp),$inout1
pxor $twres,@tweak[5]
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
lea 16*5($inp),$inp
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
call _aesni_encrypt6
xorps @tweak[0],$inout0
movdqa @tweak[5],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movdqu $inout0,($out)
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
movdqu $inout4,16*4($out)
lea 16*5($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_one:
movups ($inp),$inout0
lea 16*1($inp),$inp
xorps @tweak[0],$inout0
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
movups $inout0,($out)
lea 16*1($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_two:
movups ($inp),$inout0
movups 16($inp),$inout1
lea 32($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
call _aesni_encrypt3
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movups $inout0,($out)
movups $inout1,16*1($out)
lea 16*2($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_three:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
lea 16*3($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
call _aesni_encrypt3
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
movups $inout2,16*2($out)
lea 16*3($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_four:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
lea 16*4($inp),$inp
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_encrypt4
xorps @tweak[0],$inout0
movdqa @tweak[5],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movups $inout0,($out)
xorps @tweak[3],$inout3
movups $inout1,16*1($out)
movups $inout2,16*2($out)
movups $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_done:
and \$15,$len_
jz .Lxts_enc_ret
mov $len_,$len
.Lxts_enc_steal:
movzb ($inp),%eax # borrow $rounds ...
movzb -16($out),%ecx # ... and $key
lea 1($inp),$inp
mov %al,-16($out)
mov %cl,0($out)
lea 1($out),$out
sub \$1,$len
jnz .Lxts_enc_steal
sub $len_,$out # rewind $out
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
movups -16($out),$inout0
xorps @tweak[0],$inout0
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movups $inout0,-16($out)
.Lxts_enc_ret:
___
$code.=<<___ if ($win64);
movaps 0x60(%rsp),%xmm6
movaps 0x70(%rsp),%xmm7
movaps 0x80(%rsp),%xmm8
movaps 0x90(%rsp),%xmm9
movaps 0xa0(%rsp),%xmm10
movaps 0xb0(%rsp),%xmm11
movaps 0xc0(%rsp),%xmm12
movaps 0xd0(%rsp),%xmm13
movaps 0xe0(%rsp),%xmm14
movaps 0xf0(%rsp),%xmm15
___
$code.=<<___;
lea $frame_size(%rsp),%rsp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
my $increment="%xmm10";
my $bswap_mask="%xmm11";
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.globl aesni_xts_decrypt
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_ctr32_encrypt_blocks:
aesni_xts_decrypt:
lea -$frame_size(%rsp),%rsp
___
$code.=<<___ if ($win64);
lea -0x68(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
.Lctr32_body:
movaps %xmm6,0x60(%rsp)
movaps %xmm7,0x70(%rsp)
movaps %xmm8,0x80(%rsp)
movaps %xmm9,0x90(%rsp)
movaps %xmm10,0xa0(%rsp)
movaps %xmm11,0xb0(%rsp)
movaps %xmm12,0xc0(%rsp)
movaps %xmm13,0xd0(%rsp)
movaps %xmm14,0xe0(%rsp)
movaps %xmm15,0xf0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
cmp \$1,$len
je .Lctr32_one_shortcut
movdqu ($ivp),$inout3
movdqa .Lincrement32(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask
xor $rounds,$rounds
pextrd \$3,$inout3,$rnds_ # pull 32-bit counter
pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
xor %eax,%eax # if ($len%16) len-=16;
test \$15,$len
setnz %al
shl \$4,%rax
sub %rax,$len
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
mov $len,$len_ # backup $len
and \$-16,$len
mov 240($key),$rounds # key->rounds
pxor $iv,$iv # vector of 3 32-bit counters
bswap $rnds_
pinsrd \$0,$rnds_,$iv
inc $rnds_
pinsrd \$1,$rnds_,$iv
inc $rnds_
pinsrd \$2,$rnds_,$iv
pshufb $bswap_mask,$iv
movdqa .Lxts_magic(%rip),$twmask
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[$i]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,$twres # isolate carry and residue
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
___
}
$code.=<<___;
sub \$16*6,$len
jc .Lxts_dec_short
cmp \$4,$len
jbe .Lctr32_tail
shr \$1,$rounds
sub \$1,$rounds
mov $rounds,$rnds_
mov $key,$key_
sub \$4,$len
.Lctr32_loop3:
pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword
pshufd \$`2<<6`,$iv,$inout1
por $inout3,$inout0 # merge counter-less ivec
pshufd \$`1<<6`,$iv,$inout2
por $inout3,$inout1
por $inout3,$inout2
jmp .Lxts_dec_grandloop
# inline _aesni_encrypt3 and interleave last round
.align 16
.Lxts_dec_grandloop:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
pand $twmask,$twres # isolate carry and residue
movdqu `16*1`($inp),$inout1
pxor $twres,@tweak[5]
movdqu `16*2`($inp),$inout2
pxor @tweak[0],$inout0 # input^=tweak
movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[2],$inout2
movdqu `16*5`($inp),$inout5
lea `16*6`($inp),$inp
pxor @tweak[3],$inout3
$movkey ($key_),$rndkey0
pxor @tweak[4],$inout4
pxor @tweak[5],$inout5
# inline _aesni_decrypt6 and interleave first and last rounds
# with own code...
$movkey 16($key_),$rndkey1
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
aesdec $rndkey1,$inout0
lea 32($key_),$key
pxor $rndkey0,$inout2
movdqa @tweak[1],`16*1`(%rsp)
aesdec $rndkey1,$inout1
pxor $rndkey0,$inout3
movdqa @tweak[2],`16*2`(%rsp)
aesdec $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
pxor $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
movdqa @tweak[4],`16*4`(%rsp)
aesdec $rndkey1,$inout4
movdqa @tweak[5],`16*5`(%rsp)
aesdec $rndkey1,$inout5
pxor $twtmp,$twtmp
pcmpgtd @tweak[5],$twtmp
jmp .Lxts_dec_loop6_enter
$movkey ($key),$rndkey0
shr \$1,$rounds
$movkey 16($key),$rndkey1
lea 32($key),$key
pxor $rndkey0,$inout0
pxor $rndkey0,$inout1
pxor $rndkey0,$inout2
$movkey ($key),$rndkey0
jmp .Lctr32_enc_loop3
.align 16
.Lctr32_enc_loop3:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
dec $rounds
aesenc $rndkey1,$inout2
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
.Lxts_dec_loop6_enter:
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
lea 32($key),$key
aesenc $rndkey0,$inout2
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey ($key),$rndkey0
jnz .Lctr32_enc_loop3
jnz .Lxts_dec_loop6
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesdec $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
aesdec $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
aesdec $rndkey1,$inout2
pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesdec $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesdec $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesdec $rndkey0,$inout2
pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey 32($key),$rndkey0
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[1]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesdec $rndkey1,$inout0
pand $twmask,$twres # isolate carry and residue
aesdec $rndkey1,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesdec $rndkey1,$inout2
pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[2]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
aesdeclast $rndkey0,$inout0
pand $twmask,$twres # isolate carry and residue
aesdeclast $rndkey0,$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
aesdeclast $rndkey0,$inout2
pxor $twres,@tweak[5]
aesdeclast $rndkey0,$inout3
aesdeclast $rndkey0,$inout4
aesdeclast $rndkey0,$inout5
pshufd \$0x13,$twtmp,$twres
pxor $twtmp,$twtmp
movdqa @tweak[5],@tweak[3]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
xorps `16*0`(%rsp),$inout0 # output^=tweak
pand $twmask,$twres # isolate carry and residue
xorps `16*1`(%rsp),$inout1
pcmpgtd @tweak[5],$twtmp # broadcat upper bits
pxor $twres,@tweak[5]
xorps `16*2`(%rsp),$inout2
movups $inout0,`16*0`($out) # write output
xorps `16*3`(%rsp),$inout3
movups $inout1,`16*1`($out)
xorps `16*4`(%rsp),$inout4
movups $inout2,`16*2`($out)
xorps `16*5`(%rsp),$inout5
movups $inout3,`16*3`($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,`16*4`($out)
movups $inout5,`16*5`($out)
lea `16*6`($out),$out
sub \$16*6,$len
jnc .Lxts_dec_grandloop
lea 3($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_dec_short:
add \$16*6,$len
jz .Lxts_dec_done
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
cmp \$0x20,$len
jb .Lxts_dec_one
je .Lxts_dec_two
pshufb $bswap_mask,$iv
movdqu ($inp),$in0
aesenclast $rndkey0,$inout0
movdqu 0x10($inp),$in1
paddd $increment,$iv
aesenclast $rndkey0,$inout1
movdqu 0x20($inp),$in2
pshufb $bswap_mask,$iv
aesenclast $rndkey0,$inout2
lea 0x30($inp),$inp
cmp \$0x40,$len
jb .Lxts_dec_three
je .Lxts_dec_four
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movdqu 16*1($inp),$inout1
pxor $twres,@tweak[5]
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
lea 16*5($inp),$inp
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
call _aesni_decrypt6
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movdqu $inout0,($out)
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
pxor $twtmp,$twtmp
movdqu $inout3,16*3($out)
pcmpgtd @tweak[5],$twtmp
movdqu $inout4,16*4($out)
lea 16*5($out),$out
pshufd \$0x13,$twtmp,@tweak[1] # $twres
and \$15,$len_
jz .Lxts_dec_ret
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,@tweak[1] # isolate carry and residue
pxor @tweak[5],@tweak[1]
jmp .Lxts_dec_done2
mov $key_,$key
pxor $inout0,$in0
sub \$3,$len
mov $rnds_,$rounds
pxor $inout1,$in1
movdqu $in0,($out)
pxor $inout2,$in2
movdqu $in1,0x10($out)
movdqu $in2,0x20($out)
lea 0x30($out),$out
ja .Lctr32_loop3
.align 16
.Lxts_dec_one:
movups ($inp),$inout0
lea 16*1($inp),$inp
xorps @tweak[0],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
movups $inout0,($out)
movdqa @tweak[2],@tweak[1]
lea 16*1($out),$out
jmp .Lxts_dec_done
pextrd \$1,$iv,$rnds_ # might need last counter value
add \$4,$len
bswap $rnds_
.align 16
.Lxts_dec_two:
movups ($inp),$inout0
movups 16($inp),$inout1
lea 32($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
.Lctr32_tail:
pshufd \$`3<<6`,$iv,$inout0
pshufd \$`2<<6`,$iv,$inout1
por $inout3,$inout0
movdqu ($inp),$in0
cmp \$2,$len
jb .Lctr32_one
lea 1($rnds_),$rnds_
pshufd \$`1<<6`,$iv,$inout2
por $inout3,$inout1
movdqu 0x10($inp),$in1
je .Lctr32_two
bswap $rnds_
por $inout3,$inout2
movdqu 0x20($inp),$in2
cmp \$3,$len
je .Lctr32_three
call _aesni_decrypt3
pinsrd \$3,$rnds_,$inout3 # compose last counter value
movdqu 0x30($inp),$iv
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[3],@tweak[1]
movups $inout0,($out)
movups $inout1,16*1($out)
lea 16*2($out),$out
jmp .Lxts_dec_done
call _aesni_encrypt4
.align 16
.Lxts_dec_three:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
lea 16*3($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
pxor $inout0,$in0
pxor $inout1,$in1
pxor $inout2,$in2
movdqu $in0,($out)
pxor $inout3,$iv
movdqu $in1,0x10($out)
movdqu $in2,0x20($out)
movdqu $iv,0x30($out)
jmp .Lctr32_done
call _aesni_decrypt3
.align 16
.Lctr32_one_shortcut:
movdqu ($ivp),$inout0
movdqu ($inp),$in0
mov 240($key),$rounds # key->rounds
.Lctr32_one:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
pxor $inout0,$in0
movdqu $in0,($out)
jmp .Lctr32_done
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
movups $inout2,16*2($out)
lea 16*3($out),$out
jmp .Lxts_dec_done
.align 16
.Lctr32_two:
pxor $inout2,$inout2
call _aesni_encrypt3
pxor $inout0,$in0
pxor $inout1,$in1
movdqu $in0,($out)
movdqu $in1,0x10($out)
jmp .Lctr32_done
.Lxts_dec_four:
pshufd \$0x13,$twtmp,$twres
movdqa @tweak[5],@tweak[4]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movups ($inp),$inout0
pand $twmask,$twres # isolate carry and residue
movups 16*1($inp),$inout1
pxor $twres,@tweak[5]
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
lea 16*4($inp),$inp
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_decrypt4
xorps @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
xorps @tweak[3],$inout3
movups $inout1,16*1($out)
movups $inout2,16*2($out)
movups $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_dec_done
.align 16
.Lctr32_three:
call _aesni_encrypt3
pxor $inout0,$in0
pxor $inout1,$in1
pxor $inout2,$in2
movdqu $in0,($out)
movdqu $in1,0x10($out)
movdqu $in2,0x20($out)
.Lxts_dec_done:
and \$15,$len_
jz .Lxts_dec_ret
.Lxts_dec_done2:
mov $len_,$len
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
.Lctr32_done:
movups ($inp),$inout0
xorps @tweak[1],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[1],$inout0
movups $inout0,($out)
.Lxts_dec_steal:
movzb 16($inp),%eax # borrow $rounds ...
movzb ($out),%ecx # ... and $key
lea 1($inp),$inp
mov %al,($out)
mov %cl,16($out)
lea 1($out),$out
sub \$1,$len
jnz .Lxts_dec_steal
sub $len_,$out # rewind $out
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
movups ($out),$inout0
xorps @tweak[0],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movups $inout0,($out)
.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
movaps 0x50(%rsp),%xmm11
lea 0x68(%rsp),%rsp
.Lctr32_ret:
movaps 0x60(%rsp),%xmm6
movaps 0x70(%rsp),%xmm7
movaps 0x80(%rsp),%xmm8
movaps 0x90(%rsp),%xmm9
movaps 0xa0(%rsp),%xmm10
movaps 0xb0(%rsp),%xmm11
movaps 0xc0(%rsp),%xmm12
movaps 0xd0(%rsp),%xmm13
movaps 0xe0(%rsp),%xmm14
movaps 0xf0(%rsp),%xmm15
___
$code.=<<___;
lea $frame_size(%rsp),%rsp
.Lxts_dec_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
}}
} }}
########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
$reserved = $win64?0x40:-0x18; # used in decrypt
{
my $reserved = $win64?0x40:-0x18; # used in decrypt
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
......@@ -787,12 +2123,12 @@ ${PREFIX}_cbc_encrypt:
test $len,$len # check length
jz .Lcbc_ret
mov 240($key),$rnds_ # pull $rounds
mov 240($key),$rnds_ # key->rounds
mov $key,$key_ # backup $key
test %r9d,%r9d # 6th argument
jz .Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
movdqu ($ivp),$inout0 # load iv as initial state
movups ($ivp),$inout0 # load iv as initial state
mov $rnds_,$rounds
cmp \$16,$len
jb .Lcbc_enc_tail
......@@ -800,11 +2136,11 @@ ${PREFIX}_cbc_encrypt:
jmp .Lcbc_enc_loop
.align 16
.Lcbc_enc_loop:
movdqu ($inp),$inout1 # load input
movups ($inp),$inout1 # load input
lea 16($inp),$inp
pxor $inout1,$inout0
#xorps $inout1,$inout0
___
&aesni_generate1("enc",$key,$rounds);
&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
mov $rnds_,$rounds # restore $rounds
mov $key_,$key # restore $key
......@@ -846,106 +2182,251 @@ ___
$code.=<<___;
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x40,$len
cmp \$0x70,$len
jbe .Lcbc_dec_tail
sub \$0x40,$len
jmp .Lcbc_dec_loop3
shr \$1,$rnds_
sub \$0x70,$len
mov $rnds_,$rounds
movaps $iv,$reserved(%rsp)
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop3:
movups ($inp),$inout0
.Lcbc_dec_loop8:
movaps $rndkey0,$reserved(%rsp) # save IV
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
$movkey ($key),$rndkey0
movups ($inp),$inout0 # load input
movups 0x10($inp),$inout1
movups 0x20($inp),$inout2
movaps $inout0,$in0
movaps $inout1,$in1
movaps $inout2,$in2
call _aesni_decrypt3
$movkey 16($key),$rndkey1
sub \$0x30,$len
lea 0x30($inp),$inp
lea 0x30($out),$out
pxor $iv,$inout0
pxor $in0,$inout1
movaps $in2,$iv
pxor $in1,$inout2
movdqu $inout0,-0x30($out)
mov $rnds_,$rounds # restore $rounds
movdqu $inout1,-0x20($out)
mov $key_,$key # restore $key
movdqu $inout2,-0x10($out)
ja .Lcbc_dec_loop3
lea 32($key),$key
movdqu 0x20($inp),$inout2
xorps $rndkey0,$inout0
movdqu 0x30($inp),$inout3
xorps $rndkey0,$inout1
movdqu 0x40($inp),$inout4
aesdec $rndkey1,$inout0
pxor $rndkey0,$inout2
movdqu 0x50($inp),$inout5
aesdec $rndkey1,$inout1
pxor $rndkey0,$inout3
movdqu 0x60($inp),$inout6
aesdec $rndkey1,$inout2
pxor $rndkey0,$inout4
movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
pxor $rndkey0,$inout5
dec $rounds
aesdec $rndkey1,$inout4
pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
pxor $rndkey0,$inout7
$movkey ($key),$rndkey0
aesdec $rndkey1,$inout6
aesdec $rndkey1,$inout7
$movkey 16($key),$rndkey1
add \$0x40,$len
movups $iv,($ivp)
call .Ldec_loop8_enter
movups ($inp),$rndkey1 # re-load input
movups 0x10($inp),$rndkey0
xorps $reserved(%rsp),$inout0 # ^= IV
xorps $rndkey1,$inout1
movups 0x20($inp),$rndkey1
xorps $rndkey0,$inout2
movups 0x30($inp),$rndkey0
xorps $rndkey1,$inout3
movups 0x40($inp),$rndkey1
xorps $rndkey0,$inout4
movups 0x50($inp),$rndkey0
xorps $rndkey1,$inout5
movups 0x60($inp),$rndkey1
xorps $rndkey0,$inout6
movups 0x70($inp),$rndkey0 # IV
xorps $rndkey1,$inout7
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
mov $rnds_,$rounds # restore $rounds
movups $inout4,0x40($out)
mov $key_,$key # restore $key
movups $inout5,0x50($out)
lea 0x80($inp),$inp
movups $inout6,0x60($out)
lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
movaps $rndkey0,$iv
add \$0x70,$len
jle .Lcbc_dec_tail_collected
movups $inout0,($out)
lea 1($rnds_,$rnds_),$rounds
lea 0x10($out),$out
.Lcbc_dec_tail:
movups ($inp),$inout0
movaps $inout0,$in0
cmp \$0x10,$len
jbe .Lcbc_dec_one
movups 0x10($inp),$inout1
movaps $inout1,$in1
cmp \$0x20,$len
jbe .Lcbc_dec_two
movups 0x20($inp),$inout2
movaps $inout2,$in2
cmp \$0x30,$len
jbe .Lcbc_dec_three
movups 0x30($inp),$inout3
call _aesni_decrypt4
pxor $iv,$inout0
movups 0x30($inp),$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
movdqa $inout3,$inout0
lea 0x30($out),$out
cmp \$0x40,$len
jbe .Lcbc_dec_four
movups 0x40($inp),$inout4
cmp \$0x50,$len
jbe .Lcbc_dec_five
movups 0x50($inp),$inout5
cmp \$0x60,$len
jbe .Lcbc_dec_six
movups 0x60($inp),$inout6
movaps $iv,$reserved(%rsp) # save IV
call _aesni_decrypt8
movups ($inp),$rndkey1
movups 0x10($inp),$rndkey0
xorps $reserved(%rsp),$inout0 # ^= IV
xorps $rndkey1,$inout1
movups 0x20($inp),$rndkey1
xorps $rndkey0,$inout2
movups 0x30($inp),$rndkey0
xorps $rndkey1,$inout3
movups 0x40($inp),$rndkey1
xorps $rndkey0,$inout4
movups 0x50($inp),$rndkey0
xorps $rndkey1,$inout5
movups 0x60($inp),$iv # IV
xorps $rndkey0,$inout6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
lea 0x60($out),$out
movaps $inout6,$inout0
sub \$0x70,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_one:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
pxor $iv,$inout0
xorps $iv,$inout0
movaps $in0,$iv
sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
pxor $inout2,$inout2
xorps $inout2,$inout2
call _aesni_decrypt3
pxor $iv,$inout0
pxor $in0,$inout1
movdqu $inout0,($out)
xorps $iv,$inout0
xorps $in0,$inout1
movups $inout0,($out)
movaps $in1,$iv
movdqa $inout1,$inout0
movaps $inout1,$inout0
lea 0x10($out),$out
sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
call _aesni_decrypt3
pxor $iv,$inout0
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
xorps $iv,$inout0
xorps $in0,$inout1
movups $inout0,($out)
xorps $in1,$inout2
movups $inout1,0x10($out)
movaps $in2,$iv
movdqa $inout2,$inout0
movaps $inout2,$inout0
lea 0x20($out),$out
sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
call _aesni_decrypt4
xorps $iv,$inout0
movups 0x30($inp),$iv
xorps $in0,$inout1
movups $inout0,($out)
xorps $in1,$inout2
movups $inout1,0x10($out)
xorps $in2,$inout3
movups $inout2,0x20($out)
movaps $inout3,$inout0
lea 0x30($out),$out
sub \$0x40,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
movups 0x10($inp),$rndkey1
movups 0x20($inp),$rndkey0
xorps $iv,$inout0
xorps $in0,$inout1
xorps $rndkey1,$inout2
movups 0x30($inp),$rndkey1
xorps $rndkey0,$inout3
movups 0x40($inp),$iv
xorps $rndkey1,$inout4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
lea 0x40($out),$out
movaps $inout4,$inout0
sub \$0x50,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_six:
call _aesni_decrypt6
movups 0x10($inp),$rndkey1
movups 0x20($inp),$rndkey0
xorps $iv,$inout0
xorps $in0,$inout1
xorps $rndkey1,$inout2
movups 0x30($inp),$rndkey1
xorps $rndkey0,$inout3
movups 0x40($inp),$rndkey0
xorps $rndkey1,$inout4
movups 0x50($inp),$iv
xorps $rndkey0,$inout5
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
lea 0x50($out),$out
movaps $inout5,$inout0
sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_tail_collected:
and \$15,$len
movups $iv,($ivp)
jnz .Lcbc_dec_tail_partial
movdqu $inout0,($out)
movups $inout0,($out)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps $inout0,$reserved(%rsp)
mov \$16,%rcx
mov $out,%rdi
mov $len,%rcx
sub $len,%rcx
lea $reserved(%rsp),%rsi
.long 0x9066A4F3 # rep movsb
......@@ -963,7 +2444,7 @@ $code.=<<___;
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
# int bits, AES_KEY *key)
{ my ($inp,$bits,$key) = @_4args;
......@@ -1033,7 +2514,7 @@ __aesni_set_encrypt_key:
jz .Lenc_key_ret
movups ($inp),%xmm0 # pull first 128 bits of *userKey
pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
lea 16($key),%rax
cmp \$256,$bits
je .L14rounds
......@@ -1148,11 +2629,11 @@ __aesni_set_encrypt_key:
lea 16(%rax),%rax
.Lkey_expansion_128_cold:
shufps \$0b00010000,%xmm0,%xmm4
pxor %xmm4, %xmm0
xorps %xmm4, %xmm0
shufps \$0b10001100,%xmm0,%xmm4
pxor %xmm4, %xmm0
pshufd \$0b11111111,%xmm1,%xmm1 # critical path
pxor %xmm1,%xmm0
xorps %xmm4, %xmm0
shufps \$0b11111111,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm0
ret
.align 16
......@@ -1163,11 +2644,11 @@ __aesni_set_encrypt_key:
movaps %xmm2, %xmm5
.Lkey_expansion_192b_warm:
shufps \$0b00010000,%xmm0,%xmm4
movaps %xmm2,%xmm3
pxor %xmm4,%xmm0
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
shufps \$0b10001100,%xmm0,%xmm4
pslldq \$4,%xmm3
pxor %xmm4,%xmm0
xorps %xmm4,%xmm0
pshufd \$0b01010101,%xmm1,%xmm1 # critical path
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
......@@ -1191,11 +2672,11 @@ __aesni_set_encrypt_key:
lea 16(%rax),%rax
.Lkey_expansion_256a_cold:
shufps \$0b00010000,%xmm0,%xmm4
pxor %xmm4,%xmm0
xorps %xmm4,%xmm0
shufps \$0b10001100,%xmm0,%xmm4
pxor %xmm4,%xmm0
pshufd \$0b11111111,%xmm1,%xmm1 # critical path
pxor %xmm1,%xmm0
xorps %xmm4,%xmm0
shufps \$0b11111111,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm0
ret
.align 16
......@@ -1204,13 +2685,14 @@ __aesni_set_encrypt_key:
lea 16(%rax),%rax
shufps \$0b00010000,%xmm2,%xmm4
pxor %xmm4,%xmm2
xorps %xmm4,%xmm2
shufps \$0b10001100,%xmm2,%xmm4
pxor %xmm4,%xmm2
pshufd \$0b10101010,%xmm1,%xmm1 # critical path
pxor %xmm1,%xmm2
xorps %xmm4,%xmm2
shufps \$0b10101010,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm2
ret
.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}
......@@ -1219,9 +2701,12 @@ $code.=<<___;
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
.long 3,3,3,0
.long 6,6,6,0
.Lincrement64:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
......@@ -1253,12 +2738,8 @@ ecb_se_handler:
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
jmp .Lcommon_seh_exit
jmp .Lcommon_seh_tail
.size ecb_se_handler,.-ecb_se_handler
.type ccm64_se_handler,\@abi-omnipotent
......@@ -1284,29 +2765,22 @@ ccm64_se_handler:
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_ccm64_prologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_ccm64_prologue
jae .Lcommon_seh_tail
lea 0(%rax),%rsi # top of stack
lea 0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0x58(%rax),%rax # adjust stack pointer
.Lin_ccm64_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
jmp .Lcommon_seh_exit
jmp .Lcommon_seh_tail
.size ccm64_se_handler,.-ccm64_se_handler
.type ctr32_se_handler,\@abi-omnipotent
......@@ -1328,29 +2802,63 @@ ctr32_se_handler:
lea .Lctr32_body(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_ctr32_prologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea .Lctr32_ret(%rip),%r10
cmp %r10,%rbx
jae .Lin_ctr32_prologue
jae .Lcommon_seh_tail
lea 0(%rax),%rsi # top of stack
lea 0x20(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax)
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0x68(%rax),%rax # adjust stack pointer
.Lin_ctr32_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
lea 0xc8(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_exit
jmp .Lcommon_seh_tail
.size ctr32_se_handler,.-ctr32_se_handler
.type xts_se_handler,\@abi-omnipotent
.align 16
xts_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue lable
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea 0x60(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0x68+160(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_tail
.size xts_se_handler,.-xts_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
......@@ -1372,7 +2880,7 @@ cbc_se_handler:
lea .Lcbc_decrypt(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_cbc_prologue
jb .Lcommon_seh_tail
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
......@@ -1380,26 +2888,25 @@ cbc_se_handler:
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lin_cbc_prologue
jae .Lcommon_seh_tail
lea 0(%rax),%rsi # top of stack
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0x58(%rax),%rax # adjust stack pointer
jmp .Lin_cbc_prologue
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
mov 120($context),%rax
.Lin_cbc_prologue:
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
.Lcommon_seh_exit:
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
......@@ -1452,6 +2959,14 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .LSEH_begin_aesni_ctr32_encrypt_blocks
.rva .LSEH_end_aesni_ctr32_encrypt_blocks
.rva .LSEH_info_ctr32
.rva .LSEH_begin_aesni_xts_encrypt
.rva .LSEH_end_aesni_xts_encrypt
.rva .LSEH_info_xts_enc
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
___
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
......@@ -1483,6 +2998,14 @@ $code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ctr32:
.byte 9,0,0,0
.rva ctr32_se_handler
.LSEH_info_xts_enc:
.byte 9,0,0,0
.rva xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
.rva xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册