diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl index 59a2d0da19053c0e5fe83fbdcb5fb118bbc210c4..e9ee3f2ee2fa6abf500c239de35026e1fa954695 100755 --- a/crypto/aes/asm/aes-586.pl +++ b/crypto/aes/asm/aes-586.pl @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # -# Version 3.3. +# Version 3.4. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered @@ -60,6 +60,12 @@ # misaligned, which unfortunately has negative impact on elder IA-32 # implementations, Pentium suffered 30% penalty, PIII - 10%. # +# Version 3.3 avoids L1 cache aliasing between stack frame and +# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The +# latter is achieved by copying the key schedule to controlled place in +# stack. This unfortunately has rather strong impact on small block CBC +# performance, ~2x deterioration on 16-byte block if compared to 3.3. +# # Current ECB performance numbers for 128-bit key in CPU cycles per # processed byte [measure commonly used by AES benchmarkers] are: # @@ -81,6 +87,12 @@ $s3="edx"; $key="edi"; $acc="esi"; +$compromise=0; # $compromise=128 abstains from copying key + # schedule to stack when encrypting inputs + # shorter than 128 bytes at the cost of + # risksing aliasing with S-boxes. In return + # you get way better, up to +70%, small block + # performance. $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! # I favor compact code to minimize cache @@ -792,6 +804,7 @@ my $_key=&DWP(32,"esp"); #copy of wparam(3) my $_ivp=&DWP(36,"esp"); #copy of wparam(4) my $_tmp=&DWP(40,"esp"); #volatile variable my $ivec=&DWP(44,"esp"); #ivec[16] +my $aes_key=&DWP(60,"esp"); #copy of aes_key &public_label("AES_Te"); &public_label("AES_Td"); @@ -804,28 +817,37 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &set_label("pic_point"); &blindpop("ebp"); + &pushf (); + &cld (); + &cmp (&wparam(5),0); &je (&label("DECRYPT")); &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... - &lea ($key,&DWP(-44,"esp")); + &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... and make sure it doesn't alias with AES_Te modulo 4096 - &mov ($s1,"ebp"); + &mov ($s0,"ebp"); + &lea ($s1,&DWP(2048,"ebp")); &mov ($s3,$key); - &and ($s1,0xfff); # t = %ebp&0xfff + &and ($s0,0xfff); # s = %ebp&0xfff + &and ($s1,0xfff); # e = (%ebp+2048)&0xfff &and ($s3,0xfff); # p = %esp&0xfff - &cmp ($s3,$s1); # if (p=(t+2048)) goto ok - &jae (&label("te_ok")); - &sub ($s1,$s3); # t -= p - &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 + &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); + &jb (&label("te_break_out")); + &sub ($s3,$s1); + &sub ($key,$s3); + &jmp (&label("te_ok")); + &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; + &sub ($s3,$s0); + &and ($s3,0xfff); + &add ($s3,64+320); + &sub ($key,$s3); + &align (4); &set_label("te_ok"); &mov ($s0,&wparam(0)); # load inp @@ -843,6 +865,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp + if ($compromise) { + &cmp ($s2,$compromise); + &jb (&label("skip_ecopy")); + } + # copy key schedule to stack + &mov ("ecx",260/4); + &mov ("esi",$s3); + &lea ("edi",$aes_key); + &mov ($_key,"edi"); + &align (4); + &data_word(0xF689A5F3); # rep movsd + &set_label("skip_ecopy") if ($compromise); + &mov ($acc,$s0); &mov ($key,16); &align (4); @@ -906,28 +941,42 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov (&DWP(4,$acc),$s1); &mov (&DWP(8,$acc),$s2); &mov (&DWP(12,$acc),$s3); + + &mov ("edi",$_key); &mov ("esp",$_esp); + if ($compromise) { + &cmp (&wparam(2),$compromise); + &jb (&label("skip_ezero")); + } + # zero copy of key schedule + &mov ("ecx",256/4); + &xor ("eax","eax"); + &align (4); + &data_word(0xF689ABF3); # rep stosd + &set_label("skip_ezero") if ($compromise); + &popf (); &set_label("enc_out"); &function_end_A(); + &pushf (); # kludge, never executed &align (4); &set_label("enc_tail"); &push ($key eq "edi" ? $key : ""); # push ivp - &pushf (); &mov ($key,$_out); # load out &mov ($s1,16); &sub ($s1,$s2); &cmp ($key,$acc); # compare with inp &je (&label("enc_in_place")); - &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input + &align (4); + &data_word(0xF689A4F3); # rep movsb # copy input &jmp (&label("enc_skip_in_place")); &set_label("enc_in_place"); &lea ($key,&DWP(0,$key,$s2)); &set_label("enc_skip_in_place"); &mov ($s2,$s1); &xor ($s0,$s0); - &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail - &popf (); + &align (4); + &data_word(0xF689AAF3); # rep stosb # zero tail &pop ($key); # pop ivp &mov ($acc,$_out); # output as input @@ -942,22 +991,28 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); # allocate aligned stack frame... - &lea ($key,&DWP(-64,"esp")); + &lea ($key,&DWP(-64-260,"esp")); &and ($key,-64); # ... and make sure it doesn't alias with AES_Td modulo 4096 - &mov ($s1,"ebp"); + &mov ($s0,"ebp"); + &lea ($s1,&DWP(3072,"ebp")); &mov ($s3,$key); - &and ($s1,0xfff); # t = %ebp&0xfff + &and ($s0,0xfff); # s = %ebp&0xfff + &and ($s1,0xfff); # e = (%ebp+3072)&0xfff &and ($s3,0xfff); # p = %esp&0xfff - &cmp ($s3,$s1); # if (p=(t+3072)) goto ok - &jae (&label("td_ok")); - &sub ($s1,$s3); # t -= p - &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64 + &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); + &jb (&label("td_break_out")); + &sub ($s3,$s1); + &sub ($key,$s3); + &jmp (&label("td_ok")); + &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz; + &sub ($s3,$s0); + &and ($s3,0xfff); + &add ($s3,64+320); + &sub ($key,$s3); + &align (4); &set_label("td_ok"); &mov ($s0,&wparam(0)); # load inp @@ -975,6 +1030,19 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &mov ($_key,$s3); # save copy of key &mov ($_ivp,$acc); # save copy of ivp + if ($compromise) { + &cmp ($s2,$compromise); + &jb (&label("skip_dcopy")); + } + # copy key schedule to stack + &mov ("ecx",260/4); + &mov ("esi",$s3); + &lea ("edi",$aes_key); + &mov ($_key,"edi"); + &align (4); + &data_word(0xF689A5F3); # rep movsd + &set_label("skip_dcopy") if ($compromise); + &mov ($acc,$s0); &mov ($key,24); &align (4); @@ -1053,10 +1121,8 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); &mov ($acc eq "esi" ? $acc : "",$key); &mov ($key eq "edi" ? $key : "",$_out); # load out - &pushf (); - &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output - &popf (); - &mov ($key,$_inp); # use inp as temp ivp + &data_word(0xF689A4F3); # rep movsb # copy output + &mov ($key,$_inp); # use inp as temp ivp &jmp (&label("dec_end")); &align (4); @@ -1122,13 +1188,23 @@ my $ivec=&DWP(44,"esp"); #ivec[16] &lea ($key,&DWP(0,$key,$s2)); &lea ($acc,&DWP(16,$acc,$s2)); &neg ($s2 eq "ecx" ? $s2 : ""); - &pushf (); - &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail - &popf (); + &data_word(0xF689A4F3); # rep movsb # restore tail &align (4); &set_label("dec_out"); + &mov ("edi",$_key); &mov ("esp",$_esp); + if ($compromise) { + &cmp (&wparam(2),$compromise); + &jb (&label("skip_dzero")); + } + # zero copy of key schedule + &mov ("ecx",256/4); + &xor ("eax","eax"); + &align (4); + &data_word(0xF689ABF3); # rep stosd + &set_label("skip_dzero") if ($compromise); + &popf (); &function_end("AES_cbc_encrypt"); } diff --git a/crypto/perlasm/x86ms.pl b/crypto/perlasm/x86ms.pl index 35ba300d0456e44a78837a00ceae0084075f85a1..3a2b4960cf88de22f46e4346614d851393a9a10e 100644 --- a/crypto/perlasm/x86ms.pl +++ b/crypto/perlasm/x86ms.pl @@ -176,6 +176,7 @@ sub main'rdtsc { &out0("DW\t0310Fh"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } +sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl index b897ed0fda79d00b71ef1738feb64b72e6717baa..7d094283878f7e2644b6cd60c7d40a94fc6867e6 100644 --- a/crypto/perlasm/x86nasm.pl +++ b/crypto/perlasm/x86nasm.pl @@ -194,6 +194,7 @@ sub main'rdtsc { &out0("rdtsc"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } sub main'neg { &out1("neg",@_); } +sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 8414d10cfaf64d18ee8b5eed4c31cecad9e06b9e..305a55bf6291ebcf2ebca330385630a328a7159c 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -210,6 +210,7 @@ sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzbl",@_); } sub main'neg { &out1("negl",@_); } +sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); }