Commit c1f092d1 authored by Andy Polyakov

GCM "jumbo" update:

- gcm128.c: support for Intel PCLMULQDQ, readability improvements;
- asm/ghash-x86.pl: split vanilla, MMX, PCLMULQDQ subroutines;
- asm/ghash-x86_64.pl: add PCLMULQDQ implementations.
Parent ea7239cf

crypto/modes/asm/ghash-x86.pl: this diff is collapsed (too large to display).

crypto/modes/asm/ghash-x86_64.pl:
@@ -20,6 +20,12 @@
# Opteron	18.5	10.2	+80%
# Core2		17.5	11.0	+59%
# May 2010
#
# Add PCLMULQDQ version performing at 2.07 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -51,7 +57,7 @@ $rem="%rdx";
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
	    $r =~ s/%[er]([sd]i)/%\1l/;
	    $r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }

{ my $N;
  sub loop() {
  my $inp = shift;
@@ -156,8 +162,7 @@ $code.=<<___;
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";
@@ -203,9 +208,295 @@ $code.=<<___;
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___
######################################################################
# PCLMULQDQ version.
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
my ($Xhi,$Xi,$Hkey,$modulo)=@_;
$code.=<<___ if (!defined($modulo));
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
pclmulqdq \$0x00,$T2,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
movdqa $T1,$T2 #
psrldq \$8,$T1
pslldq \$8,$T2 #
pxor $T1,$Xhi
pxor $T2,$Xi #
___
}
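The subroutine above is one Karatsuba 64x64->128 carry-less multiplication: two pclmulqdq for the outer products, a third for the folded middle term, then a byte-shift split to scatter that middle term across (Xhi:Xi). A rough C intrinsics rendering of the same schedule, as a sketch only (helper name and signature are mine, not part of the patch):

    #include <wmmintrin.h>   /* _mm_clmulepi64_si128; pulls in SSE2 */

    /* (Xhi:Xi) = Xi * Hkey over GF(2)[x], 3 multiplications instead of 4 */
    static void clmul64x64(__m128i *Xhi_p, __m128i *Xi_p, __m128i Hkey)
    {
        __m128i Xi = *Xi_p, Xhi, T1, T2;

        Xhi = Xi;
        T1  = _mm_shuffle_epi32(Xi,  0x4e);       /* swap 64-bit halves */
        T2  = _mm_shuffle_epi32(Hkey,0x4e);
        T1  = _mm_xor_si128(T1,Xi);               /* Xi.lo ^ Xi.hi      */
        T2  = _mm_xor_si128(T2,Hkey);             /* H.lo  ^ H.hi       */

        Xi  = _mm_clmulepi64_si128(Xi, Hkey,0x00);  /* lo*lo            */
        Xhi = _mm_clmulepi64_si128(Xhi,Hkey,0x11);  /* hi*hi            */
        T1  = _mm_clmulepi64_si128(T1, T2,  0x00);  /* (lo^hi)*(lo^hi)  */
        T1  = _mm_xor_si128(T1,Xi);               /* recover middle term */
        T1  = _mm_xor_si128(T1,Xhi);

        T2  = T1;
        T1  = _mm_srli_si128(T1,8);               /* its high half -> Xhi */
        T2  = _mm_slli_si128(T2,8);               /* its low half  -> Xi  */
        *Xhi_p = _mm_xor_si128(Xhi,T1);
        *Xi_p  = _mm_xor_si128(Xi, T2);
    }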
sub reduction_alg9 { # 17/13 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
movdqa $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #
# 2nd phase
movdqa $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #
___
}
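reduction_alg9 folds the 256-bit product (Xhi:Xi) back to 128 bits modulo the GHASH polynomial using only shifts and xors: the first phase multiplies each 64-bit lane by x^63+x^62+x^57 (the three psllq steps), the second phase folds the result back with right shifts by 1, 2 and 7. The same instruction sequence mirrored one-for-one in intrinsics (sketch; the function name is mine):

    static __m128i reduce_alg9(__m128i Xhi, __m128i Xi)
    {
        __m128i T1, T2;

        T1 = Xi;                        /* 1st phase */
        Xi = _mm_slli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T1);
        Xi = _mm_slli_epi64(Xi,5);
        Xi = _mm_xor_si128(Xi,T1);
        Xi = _mm_slli_epi64(Xi,57);     /* X<<63 ^ X<<62 ^ X<<57, per lane */
        T2 = Xi;
        Xi = _mm_slli_si128(Xi,8);      /* keep the low 64 bits ...     */
        T2 = _mm_srli_si128(T2,8);      /* ... the high 64 go to Xhi    */
        Xi = _mm_xor_si128(Xi,T1);
        Xhi= _mm_xor_si128(Xhi,T2);

        T2 = Xi;                        /* 2nd phase */
        Xi = _mm_srli_epi64(Xi,5);
        Xi = _mm_xor_si128(Xi,T2);
        Xi = _mm_srli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T2);
        T2 = _mm_xor_si128(T2,Xhi);     /* fold the high half in */
        Xi = _mm_srli_epi64(Xi,1);
        Xi = _mm_xor_si128(Xi,T2);      /* X ^ X>>1 ^ X>>2 ^ X>>7 ^ Xhi */
        return Xi;
    }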
{ my ($Htbl,$Xip)=@_4args;
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
# <<1 twist
pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
movdqa $Hkey,$T1
psllq \$1,$Hkey
pxor $T3,$T3 #
psrlq \$63,$T1
pcmpgtd $T2,$T3 # broadcast carry bit
pslldq \$8,$T1
por $T1,$Hkey # H<<=1
# magic reduction
pand .L0x1c2_polynomial(%rip),$T3
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
movdqa $Hkey,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
movdqu $Hkey,($Htbl) # save H
movdqu $Xi,16($Htbl) # save H^2
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
}
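Worth noting in gcm_init_clmul is the branch-free "<<1 twist": pcmpgtd compares the broadcast uppermost dword against zero, turning H's most significant bit into an all-ones mask that gates the xor of the 0x1c2 polynomial. In intrinsics form (sketch; the helper name is mine, and poly stands for the .L0x1c2_polynomial constant):

    static __m128i twist_H(__m128i H, __m128i poly)
    {
        __m128i T1, T2, T3;

        T2 = _mm_shuffle_epi32(H,0xff);  /* broadcast uppermost dword   */
        T1 = H;
        H  = _mm_slli_epi64(H,1);        /* <<1 within each 64-bit lane */
        T3 = _mm_setzero_si128();
        T1 = _mm_srli_epi64(T1,63);      /* inter-lane carry bit        */
        T3 = _mm_cmpgt_epi32(T3,T2);     /* all-ones iff H's MSB is set */
        T1 = _mm_slli_si128(T1,8);       /* move carry into high lane   */
        H  = _mm_or_si128(H,T1);         /* H <<= 1 across 128 bits     */
        T3 = _mm_and_si128(T3,poly);     /* magic reduction             */
        return _mm_xor_si128(H,T3);      /* if(carry) H ^= 0x1c2...     */
    }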
{ my ($Xip,$Htbl)=@_4args;
$code.=<<___;
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
ret
.size gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
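gcm_gmult_clmul is then simply: byte-reverse Xi (GHASH is specified big-endian), one Karatsuba multiply by H, one reduction, reverse back. Stitched together from the sketches above (illustrative only; _mm_shuffle_epi8 is the SSSE3 pshufb intrinsic):

    #include <tmmintrin.h>                       /* _mm_shuffle_epi8 */

    static const unsigned char bswap_mask[16] =  /* as .Lbswap_mask  */
        {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};

    static void gmult_clmul(unsigned char Xi[16], __m128i H)
    {
        __m128i bs = _mm_loadu_si128((const __m128i *)bswap_mask);
        __m128i X  = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)Xi),bs);
        __m128i Xh;

        clmul64x64(&Xh,&X,H);            /* H * Xi    */
        X = reduce_alg9(Xh,X);           /* ... mod P */
        _mm_storeu_si128((__m128i *)Xi,_mm_shuffle_epi8(X,bs));
    }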
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
my $Xn="%xmm6";
my $Xhn="%xmm7";
my $Hkey2="%xmm8";
my $T1n="%xmm9";
my $T2n="%xmm10";
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
.align 16
gcm_ghash_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
.byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
.byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
movdqu 16($Htbl),$Hkey2
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2
lea 32($inp),$inp # i+=2
sub \$0x20,$len
jbe .Leven_tail
.Lmod_loop:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
movdqu ($inp),$T1 # Ii
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn
movdqa $Xn,$Xhn #
pshufd \$0b01001110,$Xn,$T1n
pshufd \$0b01001110,$Hkey,$T2n
pxor $Xn,$T1n #
pxor $Hkey,$T2n
pxor $T1,$Xhi # "Ii+Xi", consume early
movdqa $Xi,$T1 # 1st phase
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #
pclmulqdq \$0x00,$T2n,$T1n #######
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2
pxor $Xn,$T1n #
pxor $Xhn,$T1n #
movdqa $T1n,$T2n #
psrldq \$8,$T1n
pslldq \$8,$T2n #
pxor $T1n,$Xhn
pxor $T2n,$Xn #
lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
test $len,$len
jnz .Ldone
.Lodd_tail:
movdqu ($inp),$T1 # Ii
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
pshufb $T3,$Xi
movdqu $Xi,($Xip)
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
add \$0x58,%rsp
___
$code.=<<___;
ret
.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
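The .Lmod_loop body above is software-pipelined: the reduction of the previous block pair is interleaved with the pclmulqdq work of the next, which is why the reduction_alg9 and clmul64x64_T2 sequences appear shuffled into each other. De-pipelined, one even-path iteration computes

	Xi+2 = [(H*Ii+1) + H^2*(Ii+Xi)] mod P

i.e. one reduction paid per two input blocks. With the sketch helpers from above (hypothetical, for exposition only):

    /* one even-path iteration: two input blocks, one reduction */
    static __m128i ghash_2x(__m128i Xi, __m128i H, __m128i H2,
                            const unsigned char inp[32])
    {
        __m128i bs = _mm_loadu_si128((const __m128i *)bswap_mask);
        __m128i T1 = _mm_shuffle_epi8(
                        _mm_loadu_si128((const __m128i *)inp),bs);      /* Ii   */
        __m128i Xn = _mm_shuffle_epi8(
                        _mm_loadu_si128((const __m128i *)(inp+16)),bs); /* Ii+1 */
        __m128i Xhi, Xhn;

        Xi  = _mm_xor_si128(Xi,T1);      /* Ii + Xi          */
        clmul64x64(&Xhn,&Xn,H);          /* H   * Ii+1       */
        clmul64x64(&Xhi,&Xi,H2);         /* H^2 * (Ii + Xi)  */
        Xi  = _mm_xor_si128(Xi, Xn);     /* sum the products */
        Xhi = _mm_xor_si128(Xhi,Xhn);
        return reduce_alg9(Xhi,Xi);      /* one reduction per 2 blocks */
    }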
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align	64
-.type	rem_4bit,\@object
+.type	.Lrem_4bit,\@object
.Lrem_4bit:
.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
@@ -214,7 +505,7 @@ $code.=<<___;
.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@@ -316,6 +607,10 @@ se_handler:
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
@@ -326,9 +621,46 @@ se_handler:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x1f,0x0b,0x00
.byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
.byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
.byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
.byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
___
}
sub rex {
local *opcode=shift;
my ($dst,$src)=@_;
if ($dst>=8 || $src>=8) {
$rex=0x40;
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
push @opcode,$rex;
}
}
sub pclmulqdq {
my $arg=shift;
my @opcode=(0x66);
if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
rex(\@opcode,$3,$2);
push @opcode,0x0f,0x3a,0x44;
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
my $c=$1;
push @opcode,$c=~/^0/?oct($c):$c;
return ".byte\t".join(',',@opcode);
}
return "pclmulqdq\t".$arg;
}
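The encoder above exists because assemblers of the day may not know pclmulqdq, so the instruction is re-emitted as raw bytes: 66 [REX] 0F 3A 44 /r ib. For example, pclmulqdq \$0x00,%xmm4,%xmm3 comes out as .byte 0x66,0x0f,0x3a,0x44,0xdc,0x00 (ModR/M 0xdc = 0xc0|4|3<<3; a REX prefix is emitted only when %xmm8..%xmm15 are involved).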
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem;
print $code;

crypto/modes/gcm128.c:

@@ -67,7 +67,20 @@ typedef struct { u64 hi,lo; } u128;
#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
#endif
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) do { \
if (sizeof(size_t)==8) { \
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^T; \
} \
else { \
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^((u64)T<<32); \
} \
} while(0)
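REDUCE1BIT(V) hoists a 1-bit reduction that previously appeared verbatim in three different functions (see the deletions below): it computes V*x in GF(2^128) in GCM's reflected bit order, i.e. a 128-bit right shift by one plus a conditional fold of the constant E1000000...0. A standalone equivalent of its 64-bit branch (sketch, with assumed fixed-width types):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128_t;  /* stand-in for u128 */

    static u128_t mul_by_x(u128_t V)    /* what REDUCE1BIT(V) computes */
    {
        uint64_t mask = 0 - (V.lo & 1); /* all-ones iff the shifted-out bit is 1 */
        u128_t R;

        R.lo = (V.hi << 63) | (V.lo >> 1);
        R.hi = (V.hi >> 1) ^ (0xe100000000000000ULL & mask);
        return R;
    }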
#ifdef	TABLE_BITS
#undef	TABLE_BITS
#endif
@@ -75,15 +88,14 @@ typedef struct { u64 hi,lo; } u128;
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
* never be set to 8. 8 is effectively reserved for testing purposes.
* Under ideal conditions "8-bit" version should be twice as fast as
-* "4-bit" one. But world is far from ideal. For gcc-generated x86 code,
-* "8-bit" was observed to run only ~50% faster. On x86_64 observed
-* improvement was ~75%, much closer to optimal, but the fact of
-* deviation means that references to pre-computed tables end up on
-* critical path and as tables are pretty big, 4KB per key+1KB shared,
-* execution time is sensitive to cache timing. It's not actually
-* proven, but 4-bit procedure is believed to provide adequate
-* all-round performance...
-*/
+* "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed to
+* run ~75% faster, closer to 100% for commercial compilers... But the
+* catch is that "8-bit" procedure consumes 16 times more memory, 4KB
+* per indivudual key + 1KB shared, and as access to these tables end up
+* on critical path, real-life execution time would be sensitive to
+* cache timing. It's not actually proven, but "4-bit" procedure is
+* believed to provide adequate all-round performance...
+*/
#define	TABLE_BITS 4

#if	TABLE_BITS==8
@@ -99,16 +111,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2])
	V.lo = H[1];
	for (Htable[128]=V, i=64; i>0; i>>=1) {
-		if (sizeof(size_t)==8) {
-			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
-			V.lo  = (V.hi<<63)|(V.lo>>1);
-			V.hi  = (V.hi>>1 )^T;
-		}
-		else {
-			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
-			V.lo  = (V.hi<<63)|(V.lo>>1);
-			V.hi  = (V.hi>>1 )^((u64)T<<32);
-		}
+		REDUCE1BIT(V);
		Htable[i] = V;
	}
@@ -238,18 +241,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int i;
#endif
#define REDUCE(V) do { \
if (sizeof(size_t)==8) { \
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^T; \
} \
else { \
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
V.lo = (V.hi<<63)|(V.lo>>1); \
V.hi = (V.hi>>1 )^((u64)T<<32); \
} \
} while(0)
	Htable[0].hi = 0;
	Htable[0].lo = 0;
@@ -258,7 +249,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
#if defined(OPENSSL_SMALL_FOOTPRINT)
	for (Htable[8]=V, i=4; i>0; i>>=1) {
-		REDUCE(V);
+		REDUCE1BIT(V);
		Htable[i] = V;
	}
@@ -272,11 +263,11 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
	}
#else
	Htable[8] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[4] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[2] = V;
-	REDUCE(V);
+	REDUCE1BIT(V);
	Htable[1] = V;
	Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
	V=Htable[4];
@@ -314,7 +305,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
	}
}
#endif
#undef REDUCE
}

#ifndef GHASH_ASM
@@ -471,7 +461,7 @@ void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
 * trashing effect. In other words idea is to hash data while it's
 * still in L1 cache after encryption pass... */
@@ -514,17 +504,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
			Z.hi ^= V.hi&M;
			Z.lo ^= V.lo&M;
-			if (sizeof(size_t)==8) {
-				u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
-				V.lo  = (V.hi<<63)|(V.lo>>1);
-				V.hi  = (V.hi>>1 )^T;
-			}
-			else {
-				u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
-				V.lo  = (V.hi<<63)|(V.lo>>1);
-				V.hi  = (V.hi>>1 )^((u64)T<<32);
-			}
+			REDUCE1BIT(V);
	}
}
@@ -559,12 +539,40 @@ struct gcm128_context {
	u128 Htable[256];
#else
	u128 Htable[16];
void (*gmult)(u64 Xi[2],const u128 Htable[16]);
void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
	unsigned int res, pad;
	block128_f block;
	void *key;
};
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_IAX
extern unsigned int OPENSSL_ia32cap_P[2];
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# undef GCM_MUL
# define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
# undef GHASH
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
	const union { long one; char little; } is_endian = {1};
@@ -593,7 +601,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
#if	TABLE_BITS==8
	gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif	TABLE_BITS==4
# if defined(GHASH_ASM_IAX)
if (OPENSSL_ia32cap_P[1]&(1<<1)) {
gcm_init_clmul(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_clmul;
ctx->ghash = gcm_ghash_clmul;
return;
}
	gcm_init_4bit(ctx->Htable,ctx->H.u);
# if defined(GHASH_ASM_X86)
if (OPENSSL_ia32cap_P[0]&(1<<23)) {
ctx->gmult = gcm_gmult_4bit_mmx;
ctx->ghash = gcm_ghash_4bit_mmx;
} else {
ctx->gmult = gcm_gmult_4bit_x86;
ctx->ghash = gcm_ghash_4bit_x86;
}
# else
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
# endif
# else
gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
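A note on the capability tests in the dispatch above: OPENSSL_ia32cap_P caches the CPUID(1) feature flags, word [0] holding EDX (bit 23 = MMX) and word [1] holding ECX (bit 1 = PCLMULQDQ), so the first test selects the carry-less-multiplication path and the second the MMX path on 32-bit x86.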
@@ -671,7 +701,7 @@ void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
#ifdef GHASH
	if ((i = (len&(size_t)-16))) {
-		GHASH(aad,i,ctx);
+		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
@@ -740,7 +770,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
				in  += 16;
				j   -= 16;
			}
-			GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
+			GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
@@ -760,7 +790,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
				in  += 16;
				len -= 16;
			}
-			GHASH(out-j,j,ctx);
+			GHASH(ctx,out-j,j);
		}
#else
		while (len>=16) {
@@ -854,7 +884,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		while (len>=GHASH_CHUNK) {
			size_t j=GHASH_CHUNK;
-			GHASH(in,GHASH_CHUNK,ctx);
+			GHASH(ctx,in,GHASH_CHUNK);
			while (j) {
				(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
				++ctr;
@@ -872,7 +902,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
			len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
-			GHASH(in,i,ctx);
+			GHASH(ctx,in,i);
			while (len>=16) {
				(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
				++ctr;
@@ -1243,6 +1273,7 @@ int main()
	{
	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
	union { u64 u; u8 c[1024]; } buf;
int i;
	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
@@ -1267,11 +1298,11 @@ int main()
			ctr_t/(double)sizeof(buf),
			(gcm_t-ctr_t)/(double)sizeof(buf));
#ifdef GHASH
-	GHASH(buf.c,sizeof(buf),&ctx);
+	GHASH(&ctx,buf.c,sizeof(buf));
	start = OPENSSL_rdtsc();
-	GHASH(buf.c,sizeof(buf),&ctx);
+	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
	gcm_t = OPENSSL_rdtsc() - start;
-	printf("%.2f\n",gcm_t/(double)sizeof(buf));
+	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
#endif
	}
#endif