diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index 72faa78d1fd93469a72a1350e97c040838f2a50e..8c1426cd5b6e38df7de189cfc08db2f7bd403d5e 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -23,7 +23,8 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); +if ($PREFIX eq "aesni") { $movekey=*movaps; } +else { $movekey=*movups; } $len="eax"; $rounds="ecx"; @@ -41,7 +42,7 @@ $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; - + # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; @@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop &ret(); &function_end_B("_aesni_${p}rypt1"); } - + # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); &aesni_generate1("enc") if (!$inline); &function_begin_B("${PREFIX}_encrypt"); @@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); - + # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? 
Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every @@ -229,8 +230,9 @@ sub aesni_generate4 &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); - + if ($PREFIX eq "aesni") { +###################################################################### # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); @@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); - &sub ($len,0x40); + &cmp ($len,0x40); &jbe (&label("ecb_enc_tail")); + &sub ($len,0x40); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); @@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); -&set_label("ecb_enc_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); - &cmp ($len,0x10); - &movups ($inout0,&QWP(0,$inp)); - &je (&label("ecb_enc_one")); +&set_label("ecb_enc_tail"); &cmp ($len,0x20); + &movups ($inout0,&QWP(0,$inp)); + &jb (&label("ecb_enc_one")); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); @@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); - +###################################################################### &set_label("ecb_decrypt",16); - &sub ($len,0x40); + &cmp ($len,0x40); &jbe (&label("ecb_dec_tail")); + &sub ($len,0x40); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); @@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); -&set_label("ecb_dec_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); - &cmp ($len,0x10); - &movups ($inout0,&QWP(0,$inp)); - &je (&label("ecb_dec_one")); +&set_label("ecb_dec_tail"); &cmp ($len,0x20); + &movups ($inout0,&QWP(0,$inp)); + &jb (&label("ecb_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &je 
(&label("ecb_dec_two")); &cmp ($len,0x30); @@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") { &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); -} + +###################################################################### +# handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see engine/eng_aesni.c for details) +# +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +&function_begin("aesni_ctr32_encrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($key_,"esp"); + &sub ("esp",60); + &and ("esp",-16); # align stack + &mov (&DWP(48,"esp"),$key_); + + &movups ($inout3,&QWP(0,$rounds_)); # load ivec + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds,3); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(20,"esp"),$rounds); + &mov (&DWP(24,"esp"),$rounds); + &mov (&DWP(28,"esp"),$key_); + + &pextrd ($rounds_,$inout3,3); # pull 32-bit counter + &pinsrd ($inout3,$key_,3); # wipe 32-bit counter + + &mov ($rounds,&DWP(240,$key)); # key->rounds + &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask + + # $ivec is vector of 3 32-bit counters + &pxor ($ivec,$ivec); + &bswap ($rounds_); + &pinsrd ($ivec,$rounds_,0); + &inc ($rounds_); + &pinsrd ($ivec,$rounds_,1); + &inc ($rounds_); + &pinsrd ($ivec,$rounds_,2); + + &cmp ($len,4); + &pshufb ($ivec,$rndkey0); # byte swap + &jbe (&label("ctr32_tail")); + &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec + &mov ($rounds_,$rounds); + &mov ($key_,$key); + &sub ($len,4); + &jmp (&label("ctr32_loop3")); + +&set_label("ctr32_loop3",16); + &pshufd ($inout0,$ivec,3<<6); # place counter to 
upper dword + &pshufd ($inout1,$ivec,2<<6); + &pshufd ($inout2,$ivec,1<<6); + &por ($inout0,$inout3); # merge counter-less ivec + &por ($inout1,$inout3); + &por ($inout2,$inout3); + + &call ("_aesni_encrypt3"); + + &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask + &movups ($in0,&QWP(0,$inp)); + &movups ($in1,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &pshufb($ivec,$rndkey0); # byte swap + &paddd ($ivec,&QWP(16,"esp")); # counter increment + &pxor ($in0,$inout0); + &pxor ($in1,$inout1); + &pxor ($rndkey1,$inout2); + &movups (&QWP(0,$out),$in0); + &movups (&QWP(0x10,$out),$in1); + &movups (&QWP(0x20,$out),$rndkey1); + &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec + &pshufb($ivec,$rndkey0); # byte swap + + &sub ($len,3); + &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); + &mov ($key,$key_); + &mov ($rounds,$rounds_); + &ja (&label("ctr32_loop3")); + + &add ($len,4); + &pextrd ($rounds_,$ivec,1); # might need last counter value + &jz (&label("ctr32_ret")); + &bswap ($rounds_); + +&set_label("ctr32_tail"); + &cmp ($len,2); + &pshufd ($inout0,$ivec,3<<6); + &pshufd ($inout1,$ivec,2<<6); + &pshufd ($inout2,$ivec,1<<6); + &por ($inout0,$inout3); + &jb (&label("ctr32_one")); + &por ($inout1,$inout3); + &je (&label("ctr32_two")); + &cmp ($len,3); + &por ($inout2,$inout3); + &je (&label("ctr32_three")); + + &inc ($rounds_); # compose last counter value + &bswap ($rounds_); + &pinsrd ($inout3,$rounds_,3); + + &call ("_aesni_encrypt4"); + + &movups ($in0,&QWP(0,$inp)); + &movups ($rndkey1,&QWP(0x10,$inp)); + &movups ($rndkey0,&QWP(0x20,$inp)); + &movups ($ivec,&QWP(0x30,$inp)); + &pxor ($in0,$inout0); + &pxor ($rndkey1,$inout1); + &pxor ($rndkey0,$inout2); + &pxor ($ivec,$inout3); + &movups (&QWP(0,$out),$in0); + &movups (&QWP(0x10,$out),$rndkey1); + &movups (&QWP(0x20,$out),$rndkey0); + &movups (&QWP(0x30,$out),$ivec); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_one",16); + if ($inline) + { &aesni_inline_generate1("enc"); } 
+ else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); + &pxor ($in0,$inout0); + &movups (&QWP(0,$out),$in0); + &jmp (&label("ctr32_ret")); +&set_label("ctr32_two",16); + &call ("_aesni_encrypt3"); + &movups ($in0,&QWP(0,$inp)); + &movups ($in1,&QWP(0x10,$inp)); + &pxor ($in0,$inout0); + &pxor ($in1,$inout1); + &movups (&QWP(0,$out),$in0); + &movups (&QWP(0x10,$out),$in1); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_three",16); + &call ("_aesni_encrypt3"); + &movups ($in0,&QWP(0,$inp)); + &movups ($in1,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &pxor ($in0,$inout0); + &pxor ($in1,$inout1); + &pxor ($rndkey1,$inout2); + &movups (&QWP(0,$out),$in0); + &movups (&QWP(0x10,$out),$in1); + &movups (&QWP(0x20,$out),$rndkey1); + +&set_label("ctr32_ret"); + &mov ("esp",&DWP(48,"esp")); +&function_end("aesni_ctr32_encrypt_blocks"); +} + +###################################################################### # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); @@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") { &mov ($inp,$out); # $inp and $out are the same &mov ($key,$key_); # restore $key &jmp (&label("cbc_enc_loop")); - +###################################################################### &set_label("cbc_decrypt",16); - &sub ($len,0x40); + &cmp ($len,0x40); &jbe (&label("cbc_dec_tail")); + &sub ($len,0x40); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); @@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") { &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); -&set_label("cbc_dec_tail"); &add ($len,0x40); &jz (&label("cbc_ret")); +&set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); @@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") { &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV &function_end("${PREFIX}_cbc_encrypt"); - + 
+###################################################################### # Mechanical port from aesni-x86_64.pl. # # _aesni_set_encrypt_key is private interface, diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index cdc076e24f502f3e6cd8ae7bcfa04e3aa28cbd97..d8697519e40ead7e8a367f2e5db2bb6b1e87c93f 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -41,7 +41,7 @@ $inp="%rdi"; $out="%rsi"; $len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! -$ivp="%r8"; # cbc +$ivp="%r8"; # cbc, ctr $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key @@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1"; $inout2="%xmm2"; $inout3="%xmm3"; $rndkey0="%xmm4"; $rndkey1="%xmm5"; -$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt +$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR $in1="%xmm8"; $in2="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. @@ -214,6 +214,7 @@ ___ &aesni_generate4("dec"); if ($PREFIX eq "aesni") { +######################################################################## # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); @@ -232,8 +233,9 @@ aesni_ecb_encrypt: mov $rounds,$rnds_ # backup $rounds jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - sub \$0x40,$len + cmp \$0x40,$len jbe .Lecb_enc_tail + sub \$0x40,$len jmp .Lecb_enc_loop3 .align 16 .Lecb_enc_loop3: @@ -251,14 +253,13 @@ aesni_ecb_encrypt: movups $inout2,-0x10($out) ja .Lecb_enc_loop3 -.Lecb_enc_tail: add \$0x40,$len jz .Lecb_ret - cmp \$0x10,$len - movups ($inp),$inout0 - je .Lecb_enc_one +.Lecb_enc_tail: cmp \$0x20,$len + movups ($inp),$inout0 + jb .Lecb_enc_one movups 0x10($inp),$inout1 je .Lecb_enc_two cmp \$0x30,$len @@ -294,8 +295,9 @@ $code.=<<___; #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - sub \$0x40,$len + cmp \$0x40,$len jbe 
.Lecb_dec_tail + sub \$0x40,$len jmp .Lecb_dec_loop3 .align 16 .Lecb_dec_loop3: @@ -313,14 +315,13 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lecb_dec_loop3 -.Lecb_dec_tail: add \$0x40,$len jz .Lecb_ret - cmp \$0x10,$len - movups ($inp),$inout0 - je .Lecb_dec_one +.Lecb_dec_tail: cmp \$0x20,$len + movups ($inp),$inout0 + jb .Lecb_dec_one movups 0x10($inp),$inout1 je .Lecb_dec_two cmp \$0x30,$len @@ -357,8 +358,175 @@ $code.=<<___; ret .size aesni_ecb_encrypt,.-aesni_ecb_encrypt ___ +###################################################################### +# handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see engine/eng_aesni.c for details) +# +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +$increment="%xmm10"; +$bswap_mask="%xmm11"; + +$code.=<<___; +.globl aesni_ctr32_encrypt_blocks +.type aesni_ctr32_encrypt_blocks,\@function,5 +.align 16 +aesni_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0x68(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + +.Lctr32_body: +___ +$code.=<<___; + movups ($ivp),$inout3 + movaps .Lincrement(%rip),$increment + movaps .Lbswap_mask(%rip),$bswap_mask + xor $rounds,$rounds + pextrd \$3,$inout3,$rnds_ # pull 32-bit counter + pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter + + mov 240($key),$rounds # key->rounds + pxor $iv,$iv # vector of 3 32-bit counters + bswap $rnds_ + pinsrd \$0,$rnds_,$iv + inc $rnds_ + pinsrd \$1,$rnds_,$iv + inc $rnds_ + pinsrd \$2,$rnds_,$iv + + cmp \$4,$len + pshufb $bswap_mask,$iv + jbe .Lctr32_tail + mov $rounds,$rnds_ + mov $key,$key_ + sub \$4,$len + jmp .Lctr32_loop3 + +.align 16 +.Lctr32_loop3: + pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword + pshufd \$`2<<6`,$iv,$inout1 + pshufd \$`1<<6`,$iv,$inout2 + movups ($inp),$in0 + movups 0x10($inp),$in1 
+ movups 0x20($inp),$in2 + por $inout3,$inout0 # merge counter-less ivec + por $inout3,$inout1 + por $inout3,$inout2 + pshufb $bswap_mask,$iv + + call _aesni_encrypt3 + + paddd $increment,$iv + pxor $inout0,$in0 + pxor $inout1,$in1 + pxor $inout2,$in2 + pshufb $bswap_mask,$iv + movups $in0,($out) + movups $in1,0x10($out) + movups $in2,0x20($out) + + sub \$3,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + mov $key_,$key + mov $rnds_,$rounds + ja .Lctr32_loop3 + + add \$4,$len + pextrd \$1,$iv,$rnds_ # might need last counter value + jz .Lctr32_done + bswap $rnds_ + +.Lctr32_tail: + cmp \$2,$len + pshufd \$`3<<6`,$iv,$inout0 + pshufd \$`2<<6`,$iv,$inout1 + pshufd \$`1<<6`,$iv,$inout2 + por $inout3,$inout0 + movups ($inp),$in0 + jb .Lctr32_one + por $inout3,$inout1 + movups 0x10($inp),$in1 + je .Lctr32_two + cmp \$3,$len + por $inout3,$inout2 + movups 0x20($inp),$in2 + je .Lctr32_three + + inc $rnds_ # compose last counter value + bswap $rnds_ + pinsrd \$3,$rnds_,$inout3 + movups 0x30($inp),$iv + + call _aesni_encrypt4 + + pxor $inout0,$in0 + pxor $inout1,$in1 + pxor $inout2,$in2 + pxor $inout3,$iv + movups $in0,($out) + movups $in1,0x10($out) + movups $in2,0x20($out) + movups $iv,0x30($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_one: +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + pxor $inout0,$in0 + movups $in0,($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_two: + call _aesni_encrypt3 + pxor $inout0,$in0 + pxor $inout1,$in1 + movups $in0,($out) + movups $in1,0x10($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_three: + call _aesni_encrypt3 + pxor $inout0,$in0 + pxor $inout1,$in1 + pxor $inout2,$in2 + movups $in0,($out) + movups $in1,0x10($out) + movups $in2,0x20($out) + +.Lctr32_done: +___ + +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + lea 0x68(%rsp),%rsp +___ +$code.=<<___; +.Lctr32_ret: + ret +.size
aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +___ } +######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); @@ -429,9 +597,10 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movups ($ivp),$iv - sub \$0x40,$len + cmp \$0x40,$len mov $rnds_,$rounds jbe .Lcbc_dec_tail + sub \$0x40,$len jmp .Lcbc_dec_loop3 .align 16 .Lcbc_dec_loop3: @@ -456,11 +625,11 @@ $code.=<<___; movups $inout2,-0x10($out) ja .Lcbc_dec_loop3 -.Lcbc_dec_tail: add \$0x40,$len movups $iv,($ivp) jz .Lcbc_dec_ret +.Lcbc_dec_tail: movups ($inp),$inout0 cmp \$0x10,$len movaps $inout0,$in0 @@ -796,6 +965,11 @@ ___ } $code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement: + .long 3,3,3,0 .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 ___ @@ -810,9 +984,11 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind -.type cbc_se_handler,\@abi-omnipotent +___ +$code.=<<___ if ($PREFIX eq "aesni"); +.type ecb_se_handler,\@abi-omnipotent .align 16 -cbc_se_handler: +ecb_se_handler: push %rsi push %rdi push %rbx @@ -825,30 +1001,48 @@ cbc_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + jmp .Lcommon_seh_exit +.size ecb_se_handler,.-ecb_se_handler + +.type ctr32_se_handler,\@abi-omnipotent +.align 16 +ctr32_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lcbc_decrypt(%rip),%r10 + lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lin_prologue + jb .Lin_ctr32_prologue - lea .Lcbc_decrypt_body(%rip),%r10 - cmp %r10,%rbx # context->RipRsp - lea 
.Lcbc_ret(%rip),%r10 - cmp %r10,%rbx # context->Rip>="epilogue" label - jae .Lin_prologue + lea .Lctr32_ret(%rip),%r10 + cmp %r10,%rbx + jae .Lin_ctr32_prologue lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 - mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0x58(%rax),%rax # adjust stack pointer - jmp .Lin_prologue + lea 0x68(%rax),%rax # adjust stack pointer -.Lrestore_rax: - mov 120($context),%rax -.Lin_prologue: +.Lin_ctr32_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp @@ -856,11 +1050,12 @@ cbc_se_handler: mov %rdi,176($context) # restore context->Rdi jmp .Lcommon_seh_exit -.size cbc_se_handler,.-cbc_se_handler - -.type ecb_se_handler,\@abi-omnipotent +.size ctr32_se_handler,.-ctr32_se_handler +___ +$code.=<<___; +.type cbc_se_handler,\@abi-omnipotent .align 16 -ecb_se_handler: +cbc_se_handler: push %rsi push %rdi push %rbx @@ -873,8 +1068,33 @@ ecb_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lcbc_decrypt(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lin_cbc_prologue + + lea .Lcbc_decrypt_body(%rip),%r10 + cmp %r10,%rbx # context->RipRip>="epilogue" label + jae .Lin_cbc_prologue + + lea 0(%rax),%rsi # top of stack + lea 512($context),%rdi # &context.Xmm6 + mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x58(%rax),%rax # adjust stack pointer + jmp .Lin_cbc_prologue + +.Lrestore_cbc_rax: + mov 120($context),%rax +.Lin_cbc_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi @@ -915,10 +1135,17 @@ ecb_se_handler: .section .pdata .align 4 - .rva .LSEH_begin_${PREFIX}_ecb_encrypt - .rva .LSEH_end_${PREFIX}_ecb_encrypt +___ +$code.=<<___ if ($PREFIX eq 
"aesni"); + .rva .LSEH_begin_aesni_ecb_encrypt + .rva .LSEH_end_aesni_ecb_encrypt .rva .LSEH_info_ecb + .rva .LSEH_begin_aesni_ctr32_encrypt_blocks + .rva .LSEH_end_aesni_ctr32_encrypt_blocks + .rva .LSEH_info_ctr32 +___ +$code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt .rva .LSEH_end_${PREFIX}_cbc_encrypt .rva .LSEH_info_cbc @@ -932,9 +1159,16 @@ ecb_se_handler: .rva .LSEH_info_key .section .xdata .align 8 +___ +$code.=<<___ if ($PREFIX eq "aesni"); .LSEH_info_ecb: .byte 9,0,0,0 .rva ecb_se_handler +.LSEH_info_ctr32: + .byte 9,0,0,0 + .rva ctr32_se_handler +___ +$code.=<<___; .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler diff --git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c index 2a997cae36dc0cf1daf44901d200af0bc2ff3f27..70b2838b4ea13d81191e59c962c13e3b09aa5151 100644 --- a/crypto/engine/eng_aesni.c +++ b/crypto/engine/eng_aesni.c @@ -111,6 +111,35 @@ void ENGINE_load_aesni (void) } #ifdef COMPILE_HW_AESNI + +typedef unsigned int u32; +typedef unsigned char u8; + +#if defined(__GNUC__) && __GNUC__>=2 +# define BSWAP4(x) ({ u32 ret=(x); \ + asm volatile ("bswapl %0" \ + : "+r"(ret)); ret; }) +#elif defined(_MSC_VER) +# if _MSC_VER>=1300 +# pragma intrinsic(_byteswap_ulong) +# define BSWAP4(x) _byteswap_ulong((u32)(x)) +# elif defined(_M_IX86) + __inline u32 _bswap4(u32 val) { + _asm mov eax,val + _asm bswap eax + } +# define BSWAP4(x) _bswap4(x) +# endif +#endif + +#ifdef BSWAP4 +#define GETU32(p) BSWAP4(*(const u32 *)(p)) +#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#else +#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) +#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) +#endif + int aesni_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key); int aesni_set_decrypt_key(const unsigned char *userKey, int bits, @@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in, const AES_KEY *key, unsigned char *ivec, int enc); +void 
aesni_ctr32_encrypt_blocks(const unsigned char *in, + unsigned char *out, + size_t blocks, + const AES_KEY *key, + const unsigned char *ivec); + /* Function for ENGINE detection and control */ static int aesni_init(ENGINE *e); @@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = { NID_aes_128_cbc, NID_aes_128_cfb, NID_aes_128_ofb, + NID_aes_128_ctr, NID_aes_192_ecb, NID_aes_192_cbc, NID_aes_192_cfb, NID_aes_192_ofb, + NID_aes_192_ctr, NID_aes_256_ecb, NID_aes_256_cbc, NID_aes_256_cfb, NID_aes_256_ofb, + NID_aes_256_ctr, }; static int aesni_cipher_nids_num = (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); @@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key, int ret; AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); - if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE - || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE - || enc) - ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); - else + if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE + || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE) + && !enc) ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); + else + ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); if(ret < 0) { EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); return 0; } + if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV) + { + if (iv!=NULL) + memcpy (ctx->iv,iv,ctx->cipher->iv_len); + else { + EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED); + return 0; + } + } + return 1; } @@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC); DECLARE_AES_EVP(256,cfb,CFB); DECLARE_AES_EVP(256,ofb,OFB); +static void ctr96_inc(unsigned char *counter) { + u32 n=12; + u8 c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) +{ + AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + u32 n, ctr32; + n = 
ctx->num; + + while (n && len) { + *(out++) = *(in++) ^ ctx->buf[n]; + --len; + n = (n+1) % 16; + } + + ctr32 = GETU32(ctx->iv+12); + while (len>=16) { + size_t blocks = len/16; + /* + * 1<<24 is just a not-so-small yet not-so-large number... + */ + if (blocks > (1U<<24)) blocks = (1U<<24); + /* + * As aesni_ctr32 operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv); + /* aesni_ctr32 does not update ctx->iv, caller does: */ + PUTU32(ctx->iv+12,ctr32); + /* ... overflow was detected, propagate carry. */ + if (ctr32 == 0) ctr96_inc(ctx->iv); + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + aesni_encrypt(ctx->iv,ctx->buf,key); + ++ctr32; + PUTU32(ctx->iv+12,ctr32); + if (ctr32 == 0) ctr96_inc(ctx->iv); + while (len--) { + out[n] = in[n] ^ ctx->buf[n]; + ++n; + } + } + ctx->num = n; + + return 1; +} + +static const EVP_CIPHER aesni_128_ctr= + { + NID_aes_128_ctr,1,16,16, + EVP_CIPH_CUSTOM_IV, + aesni_init_key, + aesni_counter, + NULL, + sizeof(AESNI_KEY), + NULL, + NULL, + NULL, + NULL + }; + +static const EVP_CIPHER aesni_192_ctr= + { + NID_aes_192_ctr,1,24,16, + EVP_CIPH_CUSTOM_IV, + aesni_init_key, + aesni_counter, + NULL, + sizeof(AESNI_KEY), + NULL, + NULL, + NULL, + NULL + }; + +static const EVP_CIPHER aesni_256_ctr= + { + NID_aes_256_ctr,1,32,16, + EVP_CIPH_CUSTOM_IV, + aesni_init_key, + aesni_counter, + NULL, + sizeof(AESNI_KEY), + NULL, + NULL, + NULL, + NULL + }; + static int aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid) @@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_128_ofb: *cipher = &aesni_128_ofb; break; + case NID_aes_128_ctr: + *cipher = &aesni_128_ctr; + break; + case 
NID_aes_192_ecb: *cipher = &aesni_192_ecb; @@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_192_ofb: *cipher = &aesni_192_ofb; break; + case NID_aes_192_ctr: + *cipher = &aesni_192_ctr; + break; case NID_aes_256_ecb: *cipher = &aesni_256_ecb; @@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, case NID_aes_256_ofb: *cipher = &aesni_256_ofb; break; + case NID_aes_256_ctr: + *cipher = &aesni_256_ctr; + break; default: /* Sorry, we don't support this NID */