Commit c1f092d1 authored by Andy Polyakov

GCM "jumbo" update:

- gcm128.c: support for Intel PCLMULQDQ, readability improvements;
- asm/ghash-x86.pl: split vanilla, MMX, PCLMULQDQ subroutines;
- asm/ghash-x86_64.pl: add PCLMULQDQ implementations.
Parent ea7239cf

crypto/modes/asm/ghash-x86.pl: this diff is collapsed (too large to display).

crypto/modes/asm/ghash-x86_64.pl:
@@ -20,6 +20,12 @@
# Opteron	18.5	10.2	+80%
# Core2		17.5	11.0	+59%
# May 2010
#
# Add PCLMULQDQ version performing at 2.07 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -51,7 +57,7 @@ $rem="%rdx";
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
	    $r =~ s/%[er]([sd]i)/%\1l/;
	    $r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }

{ my $N;
  sub loop() {
  my $inp = shift;
@@ -156,8 +162,7 @@ $code.=<<___;
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";
@@ -203,9 +208,295 @@ $code.=<<___;
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___
######################################################################
# PCLMULQDQ version.
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
my ($Xhi,$Xi,$Hkey,$modulo)=@_;
$code.=<<___ if (!defined($modulo));
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
pclmulqdq \$0x00,$T2,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
movdqa $T1,$T2 #
psrldq \$8,$T1
pslldq \$8,$T2 #
pxor $T1,$Xhi
pxor $T2,$Xi #
___
}
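The subroutine above is one Karatsuba 64x64->128 carry-less multiplication: two pclmulqdq for the outer products, a third for the folded middle term, then a byte-shift split to scatter that middle term across (Xhi:Xi). A rough C intrinsics rendering of the same schedule, as a sketch only (helper name and signature are mine, not part of the patch):

    #include <wmmintrin.h>   /* _mm_clmulepi64_si128; pulls in SSE2 */

    /* (Xhi:Xi) = Xi * Hkey over GF(2)[x], 3 multiplications instead of 4 */
    static void clmul64x64(__m128i *Xhi_p, __m128i *Xi_p, __m128i Hkey)
    {
        __m128i Xi = *Xi_p, Xhi, T1, T2;

        Xhi = Xi;
        T1  = _mm_shuffle_epi32(Xi,  0x4e);       /* swap 64-bit halves */
        T2  = _mm_shuffle_epi32(Hkey,0x4e);
        T1  = _mm_xor_si128(T1,Xi);               /* Xi.lo ^ Xi.hi      */
        T2  = _mm_xor_si128(T2,Hkey);             /* H.lo  ^ H.hi       */

        Xi  = _mm_clmulepi64_si128(Xi, Hkey,0x00);  /* lo*lo            */
        Xhi = _mm_clmulepi64_si128(Xhi,Hkey,0x11);  /* hi*hi            */
        T1  = _mm_clmulepi64_si128(T1, T2,  0x00);  /* (lo^hi)*(lo^hi)  */
        T1  = _mm_xor_si128(T1,Xi);               /* recover middle term */
        T1  = _mm_xor_si128(T1,Xhi);

        T2  = T1;
        T1  = _mm_srli_si128(T1,8);               /* its high half -> Xhi */
        T2  = _mm_slli_si128(T2,8);               /* its low half  -> Xi  */
        *Xhi_p = _mm_xor_si128(Xhi,T1);
        *Xi_p  = _mm_xor_si128(Xi, T2);
    }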
sub reduction_alg9 { # 17/13 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
movdqa $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #
# 2nd phase
movdqa $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #
___
}
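reduction_alg9 folds the 256-bit product (Xhi:Xi) back to 128 bits modulo the GHASH polynomial using only shifts and xors: the first phase multiplies each 64-bit lane by x^63+x^62+x^57 (the three psllq steps), the second phase folds the result back with right shifts by 1, 2 and 7. The same instruction sequence mirrored one-for-one in intrinsics (sketch; the function name is mine):

    static __m128i reduce_alg9(__m128i Xhi, __m128i Xi)
    {
        __m128i T1, T2;

        T1 = Xi;                        /* 1st phase */
        Xi = _mm_slli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T1);
        Xi = _mm_slli_epi64(Xi,5);
        Xi = _mm_xor_si128(Xi,T1);
        Xi = _mm_slli_epi64(Xi,57);     /* X<<63 ^ X<<62 ^ X<<57, per lane */
        T2 = Xi;
        Xi = _mm_slli_si128(Xi,8);      /* keep the low 64 bits ...     */
        T2 = _mm_srli_si128(T2,8);      /* ... the high 64 go to Xhi    */
        Xi = _mm_xor_si128(Xi,T1);
        Xhi= _mm_xor_si128(Xhi,T2);

        T2 = Xi;                        /* 2nd phase */
        Xi = _mm_srli_epi64(Xi,5);
        Xi = _mm_xor_si128(Xi,T2);
        Xi = _mm_srli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T2);
        T2 = _mm_xor_si128(T2,Xhi);     /* fold the high half in */
        Xi = _mm_srli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T2);      /* X ^ X>>1 ^ X>>2 ^ X>>7 ^ Xhi */
        return Xi;
    }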
{ my ($Htbl,$Xip)=@_4args;
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
# <<1 twist
pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
movdqa $Hkey,$T1
psllq \$1,$Hkey
pxor $T3,$T3 #
psrlq \$63,$T1
pcmpgtd $T2,$T3 # broadcast carry bit
pslldq \$8,$T1
por $T1,$Hkey # H<<=1
# magic reduction
pand .L0x1c2_polynomial(%rip),$T3
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
movdqa $Hkey,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
movdqu $Hkey,($Htbl) # save H
movdqu $Xi,16($Htbl) # save H^2
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
}
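Worth noting in gcm_init_clmul is the branch-free "<<1 twist": pcmpgtd compares the broadcast uppermost dword against zero, turning H's most significant bit into an all-ones mask that gates the xor of the 0x1c2 polynomial. In intrinsics form (sketch; the helper name is mine, and poly stands for the .L0x1c2_polynomial constant):

    static __m128i twist_H(__m128i H, __m128i poly)
    {
        __m128i T1, T2, T3;

        T2 = _mm_shuffle_epi32(H,0xff);  /* broadcast uppermost dword   */
        T1 = H;
        H  = _mm_slli_epi64(H,1);        /* <<1 within each 64-bit lane */
        T3 = _mm_setzero_si128();
        T1 = _mm_srli_epi64(T1,63);      /* inter-lane carry bit        */
        T3 = _mm_cmpgt_epi32(T3,T2);     /* all-ones iff H's MSB is set */
        T1 = _mm_slli_si128(T1,8);       /* move carry into high lane   */
        H  = _mm_or_si128(H,T1);         /* H <<= 1 across 128 bits     */
        T3 = _mm_and_si128(T3,poly);     /* magic reduction             */
        return _mm_xor_si128(H,T3);      /* if(carry) H ^= 0x1c2...     */
    }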
{ my ($Xip,$Htbl)=@_4args;
$code.=<<___;
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
ret
.size gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
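gcm_gmult_clmul is then simply: byte-reverse Xi (GHASH is specified big-endian), one Karatsuba multiply by H, one reduction, reverse back. Stitched together from the sketches above (illustrative only; _mm_shuffle_epi8 is the SSSE3 pshufb intrinsic):

    #include <tmmintrin.h>                       /* _mm_shuffle_epi8 */

    static const unsigned char bswap_mask[16] =  /* as .Lbswap_mask  */
        {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};

    static void gmult_clmul(unsigned char Xi[16], __m128i H)
    {
        __m128i bs = _mm_loadu_si128((const __m128i *)bswap_mask);
        __m128i X  = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)Xi),bs);
        __m128i Xh;

        clmul64x64(&Xh,&X,H);            /* H * Xi    */
        X = reduce_alg9(Xh,X);           /* ... mod P */
        _mm_storeu_si128((__m128i *)Xi,_mm_shuffle_epi8(X,bs));
    }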
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
my $Xn="%xmm6";
my $Xhn="%xmm7";
my $Hkey2="%xmm8";
my $T1n="%xmm9";
my $T2n="%xmm10";
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
.align 16
gcm_ghash_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
.byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
.byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
movdqu 16($Htbl),$Hkey2
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2
lea 32($inp),$inp # i+=2
sub \$0x20,$len
jbe .Leven_tail
.Lmod_loop:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
movdqu ($inp),$T1 # Ii
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn
movdqa $Xn,$Xhn #
pshufd \$0b01001110,$Xn,$T1n
pshufd \$0b01001110,$Hkey,$T2n
pxor $Xn,$T1n #
pxor $Hkey,$T2n
pxor $T1,$Xhi # "Ii+Xi", consume early
movdqa $Xi,$T1 # 1st phase
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #
pclmulqdq \$0x00,$T2n,$T1n #######
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2
pxor $Xn,$T1n #
pxor $Xhn,$T1n #
movdqa $T1n,$T2n #
psrldq \$8,$T1n
pslldq \$8,$T2n #
pxor $T1n,$Xhn
pxor $T2n,$Xn #
lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
test $len,$len
jnz .Ldone
.Lodd_tail:
movdqu ($inp),$T1 # Ii
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
pshufb $T3,$Xi
movdqu $Xi,($Xip)
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
add \$0x58,%rsp
___
$code.=<<___;
ret
.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
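The .Lmod_loop body above is software-pipelined: the reduction of the previous block pair is interleaved with the pclmulqdq work of the next, which is why the reduction_alg9 and clmul64x64_T2 sequences appear shuffled into each other. De-pipelined, one even-path iteration computes

	Xi+2 = [(H*Ii+1) + H^2*(Ii+Xi)] mod P

i.e. one reduction paid per two input blocks. With the sketch helpers from above (hypothetical, for exposition only):

    /* one even-path iteration: two input blocks, one reduction */
    static __m128i ghash_2x(__m128i Xi, __m128i H, __m128i H2,
                            const unsigned char inp[32])
    {
        __m128i bs = _mm_loadu_si128((const __m128i *)bswap_mask);
        __m128i T1 = _mm_shuffle_epi8(
                        _mm_loadu_si128((const __m128i *)inp),bs);      /* Ii   */
        __m128i Xn = _mm_shuffle_epi8(
                        _mm_loadu_si128((const __m128i *)(inp+16)),bs); /* Ii+1 */
        __m128i Xhi, Xhn;

        Xi  = _mm_xor_si128(Xi,T1);      /* Ii + Xi          */
        clmul64x64(&Xhn,&Xn,H);          /* H   * Ii+1       */
        clmul64x64(&Xhi,&Xi,H2);         /* H^2 * (Ii + Xi)  */
        Xi  = _mm_xor_si128(Xi, Xn);     /* sum the products */
        Xhi = _mm_xor_si128(Xhi,Xhn);
        return reduce_alg9(Xhi,Xi);      /* one reduction per 2 blocks */
    }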
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align	64
-.type	rem_4bit,\@object
+.type	.Lrem_4bit,\@object
.Lrem_4bit:
.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
@@ -214,7 +505,7 @@ $code.=<<___;
.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@@ -316,6 +607,10 @@ se_handler:
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
@@ -326,9 +621,46 @@ se_handler:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x1f,0x0b,0x00
.byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
.byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
.byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
.byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
___
}
sub rex {
local *opcode=shift;
my ($dst,$src)=@_;
if ($dst>=8 || $src>=8) {
$rex=0x40;
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
push @opcode,$rex;
}
}
sub pclmulqdq {
my $arg=shift;
my @opcode=(0x66);
if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
rex(\@opcode,$3,$2);
push @opcode,0x0f,0x3a,0x44;
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
my $c=$1;
push @opcode,$c=~/^0/?oct($c):$c;
return ".byte\t".join(',',@opcode);
}
return "pclmulqdq\t".$arg;
}
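The encoder above exists because assemblers of the day may not know pclmulqdq, so the instruction is re-emitted as raw bytes: 66 [REX] 0F 3A 44 /r ib. For example, pclmulqdq \$0x00,%xmm4,%xmm3 comes out as .byte 0x66,0x0f,0x3a,0x44,0xdc,0x00 (ModR/M 0xdc = 0xc0|4|3<<3; a REX prefix is emitted only when %xmm8..%xmm15 are involved).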
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem;
print $code;

crypto/modes/gcm128.c:

@@ -67,7 +67,20 @@ typedef struct { u64 hi,lo; } u128;
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) do { \
if (sizeof(size_t)==8) { \
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^T; \
} \
else { \
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^((u64)T<<32); \
} \
} while(0)
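REDUCE1BIT(V) hoists a 1-bit reduction that previously appeared verbatim in three different functions (see the deletions below): it computes V*x in GF(2^128) in GCM's reflected bit order, i.e. a 128-bit right shift by one plus a conditional fold of the constant E1000000...0. A standalone equivalent of its 64-bit branch (sketch, with assumed fixed-width types):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128_t;  /* stand-in for u128 */

    static u128_t mul_by_x(u128_t V)    /* what REDUCE1BIT(V) computes */
    {
        uint64_t mask = 0 - (V.lo & 1); /* all-ones iff the shifted-out bit is 1 */
        u128_t R;

        R.lo = (V.hi << 63) | (V.lo >> 1);
        R.hi = (V.hi >> 1) ^ (0xe100000000000000ULL & mask);
        return R;
    }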
#ifdef	TABLE_BITS
#undef	TABLE_BITS
#endif
@@ -75,15 +88,14 @@ typedef struct { u64 hi,lo; } u128;
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
* never be set to 8. 8 is effectively reserved for testing purposes.
* Under ideal conditions "8-bit" version should be twice as fast as
-* "4-bit" one. But world is far from ideal. For gcc-generated x86 code,
-* "8-bit" was observed to run only ~50% faster. On x86_64 observed
-* improvement was ~75%, much closer to optimal, but the fact of
-* deviation means that references to pre-computed tables end up on
-* critical path and as tables are pretty big, 4KB per key+1KB shared,
-* execution time is sensitive to cache timing. It's not actually
-* proven, but 4-bit procedure is believed to provide adequate
-* all-round performance...
-*/
+* "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed to
+* run ~75% faster, closer to 100% for commercial compilers... But the
+* catch is that "8-bit" procedure consumes 16 times more memory, 4KB
+* per indivudual key + 1KB shared, and as access to these tables end up
+* on critical path, real-life execution time would be sensitive to
+* cache timing. It's not actually proven, but "4-bit" procedure is
+* believed to provide adequate all-round performance...
+*/
#define	TABLE_BITS 4

#if	TABLE_BITS==8
@@ -99,16 +111,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2])
	V.lo = H[1];
	for (Htable[128]=V, i=64; i>0; i>>=1) {
-		if (sizeof(size_t)==8) {
-			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
-			V.lo  = (V.hi<<63)|(V.lo>>1);
-			V.hi  = (V.hi>>1 )^T;
-		}
-		else {
-			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
-			V.lo  = (V.hi<<63)|(V.lo>>1);
-			V.hi  = (V.hi>>1 )^((u64)T<<32);
-		}
+		REDUCE1BIT(V);
		Htable[i] = V;
	}
@@ -238,18 +241,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif
#define REDUCE(V) do { \
if (sizeof(size_t)==8) { \
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^T; \
} \
else { \
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^((u64)T<<32); \
} \
} while(0)
	Htable[0].hi = 0;
	Htable[0].lo = 0;
@@ -258,7 +249,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
-		REDUCE(V);
+		REDUCE1BIT(V);
		Htable[i] = V;
	}
@@ -272,11 +263,11 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
	}
#else
	Htable[8] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[4] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[2] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
	V=Htable[4];
@@ -314,7 +305,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
	}
}
#endif
#undef REDUCE
}

#ifndef GHASH_ASM
@@ -471,7 +461,7 @@ void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 * trashing effect. In other words idea is to hash data while it's
 * still in L1 cache after encryption pass... */
@@ -514,17 +504,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;
-			if (sizeof(size_t)==8) {
-				u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
-				V.lo  = (V.hi<<63)|(V.lo>>1);
-				V.hi  = (V.hi>>1 )^T;
-			}
-			else {
-				u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
-				V.lo  = (V.hi<<63)|(V.lo>>1);
-				V.hi  = (V.hi>>1 )^((u64)T<<32);
-			}
+			REDUCE1BIT(V);
	}
}
@@ -559,12 +539,40 @@ struct gcm128_context {
	u128 Htable[256];
#else
	u128 Htable[16];
void (*gmult)(u64 Xi[2],const u128 Htable[16]);
void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
	unsigned int res, pad;
	block128_f block;
	void *key;
};
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_IAX
extern unsigned int OPENSSL_ia32cap_P[2];
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# undef GCM_MUL
# define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
# undef GHASH
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};
@@ -593,7 +601,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if defined(GHASH_ASM_IAX)
if (OPENSSL_ia32cap_P[1]&(1<<1)) {
gcm_init_clmul(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_clmul;
ctx->ghash = gcm_ghash_clmul;
return;
}
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# if defined(GHASH_ASM_X86)
if (OPENSSL_ia32cap_P[0]&(1<<23)) {
ctx->gmult = gcm_gmult_4bit_mmx;
ctx->ghash = gcm_ghash_4bit_mmx;
} else {
ctx->gmult = gcm_gmult_4bit_x86;
ctx->ghash = gcm_ghash_4bit_x86;
}
# else
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
# endif
# else
gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
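A note on the capability tests in the dispatch above: OPENSSL_ia32cap_P caches the CPUID(1) feature flags, word [0] holding EDX (bit 23 = MMX) and word [1] holding ECX (bit 1 = PCLMULQDQ), so the first test selects the carry-less-multiplication path and the second the MMX path on 32-bit x86.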
@@ -671,7 +701,7 @@ void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
-		GHASH(aad,i,ctx);
+		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
@@ -740,7 +770,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
				in  += 16;
				j   -= 16;
			}
-			GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
+			GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
@@ -760,7 +790,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
				in  += 16;
				len -= 16;
			}
-			GHASH(out-j,j,ctx);
+			GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
@@ -854,7 +884,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;
-			GHASH(in,GHASH_CHUNK,ctx);
+			GHASH(ctx,in,GHASH_CHUNK);
			while (j) {
				(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
				++ctr;
@@ -872,7 +902,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
-			GHASH(in,i,ctx);
+			GHASH(ctx,in,i);
			while (len>=16) {
				(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
				++ctr;
@@ -1243,6 +1273,7 @@ int main()
	{
	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
	union { u64 u; u8 c[1024]; } buf;
int i;
	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
@@ -1267,11 +1298,11 @@ int main()
			ctr_t/(double)sizeof(buf),
			(gcm_t-ctr_t)/(double)sizeof(buf));
#ifdef GHASH
-	GHASH(buf.c,sizeof(buf),&ctx);
+	GHASH(&ctx,buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
-	GHASH(buf.c,sizeof(buf),&ctx);
+	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;
-	printf("%.2f\n",gcm_t/(double)sizeof(buf));
+	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
#endif
	}
#endif