diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index 1dbeff09c269468a6cc0cb27eae0355f988f9648..d93b675809fd083ad48f84f86910eca22ebf3e74 100644 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -23,7 +23,7 @@ # PIII 63 /77 16 24 # P4 96 /122 30 84(***) # Opteron 50 /71 21 30 -# Core2 54 /68 13 18 +# Core2 54 /68 12.5 18 # # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, @@ -33,124 +33,84 @@ # position-independent; # (***) see comment in non-MMX routine for further details; # -# To summarize, it's 2-3 times faster than gcc-generated code. To +# To summarize, it's >2-3 times faster than gcc-generated code. To # anchor it to something else SHA1 assembler processes one byte in # 11-13 cycles on contemporary x86 cores. +# May 2010 +# +# Add PCLMULQDQ version performing at 2.13 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr is 4, resulting in 2.05 cycles per processed byte. +# As implied, this is quite optimistic estimate, because it does not +# account for Karatsuba pre- and post-processing, which for a single +# multiplication is ~5 cycles. Unfortunately Intel does not provide +# performance data for GHASH alone, only for fused GCM mode. But +# we can estimate it by subtracting CTR performance result provided +# in "AES Instruction Set" white paper: 3.54-1.38=2.16 cycles per +# processed byte or 5% off the estimate. It should be noted though +# that 3.54 is GCM result for 16KB block size, while 1.38 is CTR for +# 1KB block size, meaning that real number is likely to be a bit +# further from estimate. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By fusing GHASH with CTR former's performance is +# really limited to above (Tmul + Tmod/Naggr) equation. But if GHASH +# procedure is detached, the modulo-reduction can be interleaved with +# Naggr-1 multiplications and under ideal conditions even disappear +# from the equation. So that optimistic theoretical estimate for this +# implementation is ... 28/16=1.75, and not 2.16. Well, it's probably +# way too optimistic, at least for such small Naggr. I'd argue that +# (28+Tproc/Naggr), where Tproc is time required for Karatsuba pre- +# and post-processing, is more realistic estimate. In this case it +# gives ... 1.91 cycles per processed byte. Or in other words, +# depending on how well we can interleave reduction and one of the +# two multiplications the performance should be betwen 1.91 and 2.16. +# As already mentioned, this implementation processes one byte [out +# of 1KB buffer] in 2.13 cycles, while x86_64 counterpart - in 2.07. +# x86_64 performance is better, because larger register bank allows +# to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed agiainst slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"gcm-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); +&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); -&static_label("rem_4bit") if (!$x86only); +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } -$Zhh = "ebp"; -$Zhl = "edx"; -$Zlh = "ecx"; -$Zll = "ebx"; +($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); $inp = "edi"; $Htbl = "esi"; - + $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse # than unrolled, which has to be weighted against - # 1.7x code size reduction. Well, *overall* 1.7x, - # x86-specific code itself shrinks by 2.5x... - -sub mmx_loop() { -# MMX version performs 2.8 times better on P4 (see comment in non-MMX -# routine for further details), 40% better on Opteron, 50% better -# on PIII and Core2... In other words effort is considered to be well -# spent... - my $inp = shift; - my $rem_4bit = shift; - my $cnt = $Zhh; - my $nhi = $Zhl; - my $nlo = $Zlh; - my $rem = $Zll; - - my $Zlo = "mm0"; - my $Zhi = "mm1"; - my $tmp = "mm2"; - - &xor ($nlo,$nlo); # avoid partial register stalls on PIII - &mov ($nhi,$Zll); - &mov (&LB($nlo),&LB($nhi)); - &mov ($cnt,14); - &shl (&LB($nlo),4); - &and ($nhi,0xf0); - &movq ($Zlo,&QWP(8,$Htbl,$nlo)); - &movq ($Zhi,&QWP(0,$Htbl,$nlo)); - &movd ($rem,$Zlo); - &jmp (&label("mmx_loop")); - - &set_label("mmx_loop",16); - &psrlq ($Zlo,4); - &and ($rem,0xf); - &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &mov (&LB($nlo),&BP(0,$inp,$cnt)); - &dec ($cnt); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); - &mov ($nhi,$nlo); - &pxor ($Zlo,$tmp); - &js (&label("mmx_break")); - - &shl (&LB($nlo),4); - &and ($rem,0xf); - &psrlq ($Zlo,4); - &and ($nhi,0xf0); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); - &pxor ($Zlo,$tmp); - &jmp (&label("mmx_loop")); - - &set_label("mmx_break",16); - &shl (&LB($nlo),4); - &and ($rem,0xf); - &psrlq ($Zlo,4); - &and ($nhi,0xf0); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); - &pxor ($Zlo,$tmp); - - &psrlq ($Zlo,4); - &and ($rem,0xf); - &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); - &mov ($nhi,$nlo); - &pxor ($Zlo,$tmp); - - &psrlq ($Zlo,32); # lower part of Zlo is already there - &movd ($Zhl,$Zhi); - &psrlq ($Zhi,32); - &movd ($Zlh,$Zlo); - &movd ($Zhh,$Zhi); - - &bswap ($Zll); - &bswap ($Zhl); - &bswap ($Zlh); - &bswap ($Zhh); -} + # 2.5x x86-specific code size reduction. sub x86_loop { my $off = shift; @@ -245,33 +205,30 @@ if ($unroll) { &function_end_B("_x86_gmult_4bit_inner"); } -&function_begin("gcm_gmult_4bit"); - if (!$x86only) { - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); - &bt (&DWP(0,"ebp"),23); # check for MMX bit - &jnc (&label("x86")); - - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &mov ($inp,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - - &movz ($Zll,&BP(15,$inp)); - - &mmx_loop($inp,"eax"); +sub deposit_rem_4bit { + my $bias = shift; - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); + &mov (&DWP($bias+0, "esp"),0x0000<<16); + &mov (&DWP($bias+4, "esp"),0x1C20<<16); + &mov (&DWP($bias+8, "esp"),0x3840<<16); + &mov (&DWP($bias+12,"esp"),0x2460<<16); + &mov (&DWP($bias+16,"esp"),0x7080<<16); + &mov (&DWP($bias+20,"esp"),0x6CA0<<16); + &mov (&DWP($bias+24,"esp"),0x48C0<<16); + &mov (&DWP($bias+28,"esp"),0x54E0<<16); + &mov (&DWP($bias+32,"esp"),0xE100<<16); + &mov (&DWP($bias+36,"esp"),0xFD20<<16); + &mov (&DWP($bias+40,"esp"),0xD940<<16); + &mov (&DWP($bias+44,"esp"),0xC560<<16); + &mov (&DWP($bias+48,"esp"),0x9180<<16); + &mov (&DWP($bias+52,"esp"),0x8DA0<<16); + &mov (&DWP($bias+56,"esp"),0xA9C0<<16); + &mov (&DWP($bias+60,"esp"),0xB5E0<<16); +} + +$suffix = $x86only ? "" : "_x86"; - &function_end_A(); - &set_label("x86",16); - } +&function_begin("gcm_gmult_4bit".$suffix); &stack_push(16+4+1); # +1 for stack alignment &mov ($inp,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable @@ -302,32 +259,195 @@ if ($unroll) { &mov (&DWP(4,$inp),$Zhl); &mov (&DWP(0,$inp),$Zhh); &stack_pop(16+4+1); -&function_end("gcm_gmult_4bit"); +&function_end("gcm_gmult_4bit".$suffix); + +&function_begin("gcm_ghash_4bit".$suffix); + &stack_push(16+4+1); # +1 for 64-bit alignment + &mov ($Zll,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ("ecx",&wparam(3)); # load len + &add ("ecx",$inp); + &mov (&wparam(3),"ecx"); + + &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zll)); + &mov ($Zlh,&DWP(8,$Zll)); + &mov ($Zll,&DWP(12,$Zll)); + + &deposit_rem_4bit(16); + + &set_label("x86_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); # xor with input + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&DWP(12,"esp"),$Zll); # dump it on stack + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(0,"esp"),$Zhh); + + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(2)); + } + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &mov (&wparam(2),$inp) if (!$unroll); + &jb (&label("x86_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_ghash_4bit".$suffix); + +if (!$x86only) {{{ + +&static_label("rem_4bit"); + +sub mmx_loop() { +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron and Core2, 50% +# better on PIII... In other words effort is considered to be well +# spent... + my $inp = shift; + my $rem_4bit = shift; + my $cnt = $Zhh; + my $nhi = $Zhl; + my $nlo = $Zlh; + my $rem = $Zll; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &mov ($cnt,14); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem,$Zlo); + &jmp (&label("mmx_loop")); + + &set_label("mmx_loop",16); + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &dec ($cnt); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &js (&label("mmx_break")); + + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + &jmp (&label("mmx_loop")); + + &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &bswap ($Zhh); +} + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable -# Streamed version performs 20% better on P4, 7% on Opteron, -# 10% on Core2 and PIII... -&function_begin("gcm_ghash_4bit"); - if (!$x86only) { &call (&label("pic_point")); &set_label("pic_point"); &blindpop("eax"); - &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); - &bt (&DWP(0,"ebp"),23); # check for MMX bit - &jnc (&label("x86")); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + &movz ($Zll,&BP(15,$inp)); + + &mmx_loop($inp,"eax"); + + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +# Streamed version performs 20% better on P4, 7% on Opteron, +# 10% on Core2 and PIII... +&function_begin("gcm_ghash_4bit_mmx"); &mov ($Zhh,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable &mov ($inp,&wparam(2)); # load in &mov ($Zlh,&wparam(3)); # load len + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + &add ($Zlh,$inp); &mov (&wparam(3),$Zlh); # len to point at the end of input &stack_push(4+1); # +1 for stack alignment + &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] &mov ($Zhl,&DWP(4,$Zhh)); &mov ($Zlh,&DWP(8,$Zhh)); &mov ($Zhh,&DWP(0,$Zhh)); + &jmp (&label("mmx_outer_loop")); &set_label("mmx_outer_loop",16); &xor ($Zll,&DWP(12,$inp)); @@ -355,83 +475,484 @@ if ($unroll) { &mov (&DWP(0,$inp),$Zhh); &stack_pop(4+1); - &function_end_A(); - &set_label("x86",16); - } - &stack_push(16+4+1); # +1 for 64-bit alignment - &mov ($Zll,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - &mov ($inp,&wparam(2)); # load in - &mov ("ecx",&wparam(3)); # load len - &add ("ecx",$inp); - &mov (&wparam(3),"ecx"); +&function_end("gcm_ghash_4bit_mmx"); + +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T2,0x00); ####### + &pxor ($T1,$Xi); # + &pxor ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} - &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] - &mov ($Zhl,&DWP(4,$Zll)); - &mov ($Zlh,&DWP(8,$Zll)); - &mov ($Zll,&DWP(12,$Zll)); +sub clmul64x64_T3 { +# Even though this subroutine offers visually better ILP, it +# was empirically found to be a tad slower than above version. +# At least in gcm_ghash_clmul context. But it's just as well, +# because loop modulo-scheduling is possible only thanks to +# minimized "register" pressure... +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($T1,$Xi); # + &movdqa ($Xhi,$Xi); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pshufd ($T2,$T1,0b01001110); # + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T2,$T1); # + &pxor ($T3,$Hkey); + &pclmulqdq ($T2,$T3,0x00); ####### + &pxor ($T2,$Xi); # + &pxor ($T2,$Xhi); # + + &movdqa ($T3,$T2); # + &psrldq ($T2,8); + &pslldq ($T3,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # +} + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was then chosen and + # optimized further... + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T1,$Xi) # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # +} - &deposit_rem_4bit(16); +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); - &set_label("x86_outer_loop",16); - &xor ($Zll,&DWP(12,$inp)); # xor with input - &xor ($Zlh,&DWP(8,$inp)); - &xor ($Zhl,&DWP(4,$inp)); - &xor ($Zhh,&DWP(0,$inp)); - &mov (&DWP(12,"esp"),$Zll); # dump it on stack - &mov (&DWP(8,"esp"),$Zlh); - &mov (&DWP(4,"esp"),$Zhl); - &mov (&DWP(0,"esp"),$Zhh); + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - &shr ($Zll,20); - &and ($Zll,0xf0); + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap - if ($unroll) { - &call ("_x86_gmult_4bit_inner"); - } else { - &x86_loop(0); - &mov ($inp,&wparam(2)); - } - &lea ($inp,&DWP(16,$inp)); - &cmp ($inp,&wparam(3)); - &mov (&wparam(2),$inp) if (!$unroll); - &jb (&label("x86_outer_loop")); + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 - &mov ($inp,&wparam(0)); # load Xi - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(0,$inp),$Zhh); - &stack_pop(16+4+1); -&function_end("gcm_ghash_4bit"); + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial -sub deposit_rem_4bit { - my $bias = shift; + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); - &mov (&DWP($bias+0, "esp"),0x0000<<16); - &mov (&DWP($bias+4, "esp"),0x1C20<<16); - &mov (&DWP($bias+8, "esp"),0x3840<<16); - &mov (&DWP($bias+12,"esp"),0x2460<<16); - &mov (&DWP($bias+16,"esp"),0x7080<<16); - &mov (&DWP($bias+20,"esp"),0x6CA0<<16); - &mov (&DWP($bias+24,"esp"),0x48C0<<16); - &mov (&DWP($bias+28,"esp"),0x54E0<<16); - &mov (&DWP($bias+32,"esp"),0xE100<<16); - &mov (&DWP($bias+36,"esp"),0xFD20<<16); - &mov (&DWP($bias+40,"esp"),0xD940<<16); - &mov (&DWP($bias+44,"esp"),0xC560<<16); - &mov (&DWP($bias+48,"esp"),0x9180<<16); - &mov (&DWP($bias+52,"esp"),0x8DA0<<16); - &mov (&DWP($bias+56,"esp"),0xA9C0<<16); - &mov (&DWP($bias+60,"esp"),0xB5E0<<16); + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &lea ($inp,&DWP(32,$inp)); # i+=2 + &sub ($len,0x20); + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + + &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($Xhi,$T1); # "Ii+Xi", consume early + + &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pshufd ($T1,$T3,0b01001110); + &pxor ($Xhi,$T2); # + &pxor ($T1,$T3); + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T3,$Hkey); # + + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + + &pclmulqdq ($T1,$T3,0x00); ####### + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T1,$Xn); # + &pxor ($T1,$Xhn); # + + &movdqa ($T3,$T1); # + &psrldq ($T1,8); + &pslldq ($T3,8); # + &pxor ($Xhn,$T1); + &pxor ($Xn,$T3); # + &movdqa ($T3,&QWP(0,$const)); + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} else { # Algorith 5. Kept for reference purposes. + +sub reduction_alg5 { # 19/16 times faster than Intel version +my ($Xhi,$Xi)=@_; + + # <<1 + &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xhi); + &pslld ($Xi,1); + &pslld ($Xhi,1); # + &psrld ($T1,31); + &psrld ($T2,31); # + &movdqa ($T3,$T1); + &pslldq ($T1,4); + &psrldq ($T3,12); # + &pslldq ($T2,4); + &por ($Xhi,$T3); # + &por ($Xi,$T1); + &por ($Xhi,$T2); # + + # 1st phase + &movdqa ($T1,$Xi); + &movdqa ($T2,$Xi); + &movdqa ($T3,$Xi); # + &pslld ($T1,31); + &pslld ($T2,30); + &pslld ($Xi,25); # + &pxor ($T1,$T2); + &pxor ($T1,$Xi); # + &movdqa ($T2,$T1); # + &pslldq ($T1,12); + &psrldq ($T2,4); # + &pxor ($T3,$T1); + + # 2nd phase + &pxor ($Xhi,$T3); # + &movdqa ($Xi,$T3); + &movdqa ($T1,$T3); + &psrld ($Xi,1); # + &psrld ($T1,2); + &psrld ($T3,7); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # + &pxor ($Xi,$Xhi); # } -if (!$x86only) { +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($Xn,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$Xn); + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &pshufb ($Xi,$Xn); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); # i+=2 + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + ####### + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +}} # $sse2 + &set_label("rem_4bit",64); &data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16); &data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16); &data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16); &data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16); -} +}}} # !$x86only + &asciz("GHASH for x86, CRYPTOGAMS by "); &asm_finish(); diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 25005b9062d44dc0c906011372d8d91e9a34168d..7811a98874cd915ef2d4a5f224b776fccb85bd9b 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -20,6 +20,12 @@ # Opteron 18.5 10.2 +80% # Core2 17.5 11.0 +59% +# May 2010 +# +# Add PCLMULQDQ version performing at 2.07 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -51,7 +57,7 @@ $rem="%rdx"; sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r =~ s/%[er]([sd]i)/%\1l/; $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } - + { my $N; sub loop() { my $inp = shift; @@ -156,8 +162,7 @@ $code.=<<___; ret .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ - - + # per-function register layout $inp="%rdx"; $len="%rcx"; @@ -203,9 +208,295 @@ $code.=<<___; .Lghash_epilogue: ret .size gcm_ghash_4bit,.-gcm_ghash_4bit +___ + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$modulo)=@_; +$code.=<<___ if (!defined($modulo)); + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$T2,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + movdqa $Hkey,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqu $Hkey,($Htbl) # save H + movdqu $Xi,16($Htbl) # save H^2 + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + +{ my ($Xip,$Htbl)=@_4args; + +$code.=<<___; +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,\@abi-omnipotent +.align 16 +gcm_gmult_clmul: + movdqu ($Xip),$Xi + movdqa .Lbswap_mask(%rip),$T3 + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufb $T3,$Xi + movdqu $Xi,($Xip) + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my $Xn="%xmm6"; + my $Xhn="%xmm7"; + my $Hkey2="%xmm8"; + my $T1n="%xmm9"; + my $T2n="%xmm10"; + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 16 +gcm_ghash_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_ghash_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) + .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) + .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 16($Htbl),$Hkey2 + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + lea 32($inp),$inp # i+=2 + sub \$0x20,$len + jbe .Leven_tail + +.Lmod_loop: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + movdqu ($inp),$T1 # Ii + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + + movdqa $Xn,$Xhn # + pshufd \$0b01001110,$Xn,$T1n + pshufd \$0b01001110,$Hkey,$T2n + pxor $Xn,$T1n # + pxor $Hkey,$T2n + pxor $T1,$Xhi # "Ii+Xi", consume early + + movdqa $Xi,$T1 # 1st phase + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + pclmulqdq \$0x00,$Hkey,$Xn ####### + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + pclmulqdq \$0x11,$Hkey,$Xhn ####### + movdqa $Xi,$T2 # 2nd phase + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # + + pclmulqdq \$0x00,$T2n,$T1n ####### + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + pxor $Xn,$T1n # + pxor $Xhn,$T1n # + movdqa $T1n,$T2n # + psrldq \$8,$T1n + pslldq \$8,$T2n # + pxor $T1n,$Xhn + pxor $T2n,$Xn # + + lea 32($inp),$inp + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + add \$0x58,%rsp +___ +$code.=<<___; + ret +.LSEH_end_gcm_ghash_clmul: +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .align 64 -.type rem_4bit,\@object +.type .Lrem_4bit,\@object .Lrem_4bit: .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` @@ -214,7 +505,7 @@ $code.=<<___; .asciz "GHASH for x86_64, CRYPTOGAMS by " .align 64 ___ - + # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { @@ -316,6 +607,10 @@ se_handler: .rva .LSEH_end_gcm_ghash_4bit .rva .LSEH_info_gcm_ghash_4bit + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul + .section .xdata .align 8 .LSEH_info_gcm_gmult_4bit: @@ -326,9 +621,46 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_ghash_clmul: + .byte 0x01,0x1f,0x0b,0x00 + .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 ___ } + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + + if ($dst>=8 || $src>=8) { + $rex=0x40; + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex; + } +} + +sub pclmulqdq { + my $arg=shift; + my @opcode=(0x66); + + if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + return "pclmulqdq\t".$arg; +} + $code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem; print $code; diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c index 920f5257893d9c85fafd853147d1a0d796c67394..b21b6b3a1594718f1ab68d98689abe6abb4c2581 100644 --- a/crypto/modes/gcm128.c +++ b/crypto/modes/gcm128.c @@ -67,7 +67,20 @@ typedef struct { u64 hi,lo; } u128; #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) #endif -#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define REDUCE1BIT(V) do { \ + if (sizeof(size_t)==8) { \ + u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^T; \ + } \ + else { \ + u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^((u64)T<<32); \ + } \ +} while(0) + #ifdef TABLE_BITS #undef TABLE_BITS #endif @@ -75,15 +88,14 @@ typedef struct { u64 hi,lo; } u128; * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should * never be set to 8. 8 is effectively reserved for testing purposes. * Under ideal conditions "8-bit" version should be twice as fast as - * "4-bit" one. But world is far from ideal. For gcc-generated x86 code, - * "8-bit" was observed to run only ~50% faster. On x86_64 observed - * improvement was ~75%, much closer to optimal, but the fact of - * deviation means that references to pre-computed tables end up on - * critical path and as tables are pretty big, 4KB per key+1KB shared, - * execution time is sensitive to cache timing. It's not actually - * proven, but 4-bit procedure is believed to provide adequate - * all-round performance... - */ + * "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed to + * run ~75% faster, closer to 100% for commercial compilers... But the + * catch is that "8-bit" procedure consumes 16 times more memory, 4KB + * per indivudual key + 1KB shared, and as access to these tables end up + * on critical path, real-life execution time would be sensitive to + * cache timing. It's not actually proven, but "4-bit" procedure is + * believed to provide adequate all-round performance... + */ #define TABLE_BITS 4 #if TABLE_BITS==8 @@ -99,16 +111,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2]) V.lo = H[1]; for (Htable[128]=V, i=64; i>0; i>>=1) { - if (sizeof(size_t)==8) { - u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); - V.lo = (V.hi<<63)|(V.lo>>1); - V.hi = (V.hi>>1 )^T; - } - else { - u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); - V.lo = (V.hi<<63)|(V.lo>>1); - V.hi = (V.hi>>1 )^((u64)T<<32); - } + REDUCE1BIT(V); Htable[i] = V; } @@ -238,18 +241,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2]) #if defined(OPENSSL_SMALL_FOOTPRINT) int i; #endif -#define REDUCE(V) do { \ - if (sizeof(size_t)==8) { \ - u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ - V.lo = (V.hi<<63)|(V.lo>>1); \ - V.hi = (V.hi>>1 )^T; \ - } \ - else { \ - u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ - V.lo = (V.hi<<63)|(V.lo>>1); \ - V.hi = (V.hi>>1 )^((u64)T<<32); \ - } \ -} while(0) Htable[0].hi = 0; Htable[0].lo = 0; @@ -258,7 +249,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2]) #if defined(OPENSSL_SMALL_FOOTPRINT) for (Htable[8]=V, i=4; i>0; i>>=1) { - REDUCE(V); + REDUCE1BIT(V); Htable[i] = V; } @@ -272,11 +263,11 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2]) } #else Htable[8] = V; - REDUCE(V); + REDUCE1BIT(V); Htable[4] = V; - REDUCE(V); + REDUCE1BIT(V); Htable[2] = V; - REDUCE(V); + REDUCE1BIT(V); Htable[1] = V; Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo; V=Htable[4]; @@ -314,7 +305,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2]) } } #endif -#undef REDUCE } #ifndef GHASH_ASM @@ -471,7 +461,7 @@ void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) -#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) +#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache * trashing effect. In other words idea is to hash data while it's * still in L1 cache after encryption pass... */ @@ -514,17 +504,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) Z.hi ^= V.hi&M; Z.lo ^= V.lo&M; - if (sizeof(size_t)==8) { - u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); - V.lo = (V.hi<<63)|(V.lo>>1); - V.hi = (V.hi>>1 )^T; - } - else { - u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); - V.lo = (V.hi<<63)|(V.lo>>1); - V.hi = (V.hi>>1 )^((u64)T<<32); - } - + REDUCE1BIT(V); } } @@ -559,12 +539,40 @@ struct gcm128_context { u128 Htable[256]; #else u128 Htable[16]; + void (*gmult)(u64 Xi[2],const u128 Htable[16]); + void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); #endif unsigned int res, pad; block128_f block; void *key; }; +#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \ + (defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) +# define GHASH_ASM_IAX +extern unsigned int OPENSSL_ia32cap_P[2]; + +void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# define GHASH_ASM_X86 +void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# endif + +# undef GCM_MUL +# define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable) +# undef GHASH +# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len) +#endif + void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) { const union { long one; char little; } is_endian = {1}; @@ -593,7 +601,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) #if TABLE_BITS==8 gcm_init_8bit(ctx->Htable,ctx->H.u); #elif TABLE_BITS==4 +# if defined(GHASH_ASM_IAX) + if (OPENSSL_ia32cap_P[1]&(1<<1)) { + gcm_init_clmul(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_clmul; + ctx->ghash = gcm_ghash_clmul; + return; + } gcm_init_4bit(ctx->Htable,ctx->H.u); +# if defined(GHASH_ASM_X86) + if (OPENSSL_ia32cap_P[0]&(1<<23)) { + ctx->gmult = gcm_gmult_4bit_mmx; + ctx->ghash = gcm_ghash_4bit_mmx; + } else { + ctx->gmult = gcm_gmult_4bit_x86; + ctx->ghash = gcm_ghash_4bit_x86; + } +# else + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; +# endif +# else + gcm_init_4bit(ctx->Htable,ctx->H.u); +# endif #endif } @@ -671,7 +701,7 @@ void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len) #ifdef GHASH if ((i = (len&(size_t)-16))) { - GHASH(aad,i,ctx); + GHASH(ctx,aad,i); aad += i; len -= i; } @@ -740,7 +770,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, in += 16; j -= 16; } - GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx); + GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK); len -= GHASH_CHUNK; } if ((i = (len&(size_t)-16))) { @@ -760,7 +790,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, in += 16; len -= 16; } - GHASH(out-j,j,ctx); + GHASH(ctx,out-j,j); } #else while (len>=16) { @@ -854,7 +884,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, while (len>=GHASH_CHUNK) { size_t j=GHASH_CHUNK; - GHASH(in,GHASH_CHUNK,ctx); + GHASH(ctx,in,GHASH_CHUNK); while (j) { (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key); ++ctr; @@ -872,7 +902,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, len -= GHASH_CHUNK; } if ((i = (len&(size_t)-16))) { - GHASH(in,i,ctx); + GHASH(ctx,in,i); while (len>=16) { (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key); ++ctr; @@ -1243,6 +1273,7 @@ int main() { size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); union { u64 u; u8 c[1024]; } buf; + int i; AES_set_encrypt_key(K1,sizeof(K1)*8,&key); CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); @@ -1267,11 +1298,11 @@ int main() ctr_t/(double)sizeof(buf), (gcm_t-ctr_t)/(double)sizeof(buf)); #ifdef GHASH - GHASH(buf.c,sizeof(buf),&ctx); + GHASH(&ctx,buf.c,sizeof(buf)); start = OPENSSL_rdtsc(); - GHASH(buf.c,sizeof(buf),&ctx); + for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); gcm_t = OPENSSL_rdtsc() - start; - printf("%.2f\n",gcm_t/(double)sizeof(buf)); + printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); #endif } #endif