Add ghash-alpha.pl assembler module.

42feba47 · Andy Polyakov · 3c01a1e8 · 42feba47
隐藏空白更改
内联并排

Showing with 453 addition and 0 deletion

crypto/modes/asm/ghash-alpha.pl crypto/modes/asm/ghash-alpha.pl +453 -0

未找到文件。
--- a/crypto/modes/asm/ghash-alpha.pl
+++ b/crypto/modes/asm/ghash-alpha.pl
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% than code generated by vendor
+# compiler.
+
+$cnt="v0";	# $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3";	# $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7";	# $8
+#################
+$Xi="a0";	# $16
+$Htbl="a1";
+
+
+$nlo="a4";	# $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10";	# $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT";	# $28
+
+{ my $N;
+  sub loop() {
+
+	$N++;
+$code.=<<___;
+.align	4
+	extbl	$Xlo,7,$nlo
+	and	$nlo,0xf0,$nhi
+	sll	$nlo,4,$nlo
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Zlo,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Zhi,0($nlo)
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,6(zero)
+	extbl	$Xlo,6,$nlo
+
+	ldq	$Tlo1,8($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+	ldq	$Thi1,0($nhi)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	and	$nlo,0xf0,$nhi
+
+	xor	$Tlo1,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	xor	$Thi1,$Zhi,$Zhi
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Tlo0,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Thi0,0($nlo)
+
+.Looplo$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xlo,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Looplo$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,7(zero)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	unop
+
+
+.Loophi$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Loophi$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo0,$Zlo,$Zlo
+	xor	$Thi0,$Zhi,$Zhi
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	xor	$rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set	noat
+.set	noreorder
+.globl	gcm_gmult_4bit
+.align	4
+.ent	gcm_gmult_4bit
+gcm_gmult_4bit:
+	.frame	sp,0,ra
+	.prologue 0
+
+	ldq	$Xlo,8($Xi)
+	ldq	$Xhi,0($Xi)
+
+	br	$rem_4bit,.Lpic1
+.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	ret	(ra)
+.end	gcm_gmult_4bit
+___
+
+# argument block for gcm_ghash_4bit
+$inp="a0";	# $16
+$len="a1";
+$Xi ="a2";
+$Htbl="a3";
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.align	4
+.ent	gcm_ghash_4bit
+gcm_ghash_4bit:
+	lda	sp,-32(sp)
+	stq	ra,0(sp)
+	stq	s0,8(sp)
+	stq	s1,16(sp)
+	.mask	0x04000600,-32
+	.frame	sp,32,ra
+	.prologue 0
+
+	ldq_u	$inhi,0($inp)
+	ldq_u	$Thi0,7($inp)
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+	ldq	$Xhi,0($Xi)
+	ldq	$Xlo,8($Xi)
+
+	br	$rem_4bit,.Lpic2
+.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
+	extql	$inhi,$inp,$inhi
+	extqh	$Thi0,$inp,$Thi0
+	or	$inhi,$Thi0,$inhi
+	lda	$inp,16($inp)
+
+	extql	$inlo,$inp,$inlo
+	extqh	$Tlo0,$inp,$Tlo0
+	or	$inlo,$Tlo0,$inlo
+	subq	$len,16,$len
+
+	xor	$Xlo,$inlo,$Xlo
+	xor	$Xhi,$inhi,$Xhi
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+	beq	$len,.Ldone
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+	ldq_u	$inhi,0($inp)
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+	ldq_u	$Thi0,7($inp)
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	br	zero,.Louter
+
+.Ldone:
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	.set	noreorder
+	/*ldq	ra,0(sp)*/
+	ldq	s0,8(sp)
+	ldq	s1,16(sp)
+	lda	sp,32(sp)
+	ret	(ra)
+.end	gcm_ghash_4bit
+
+.align	4
+rem_4bit:
+	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.asciiz "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+