#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. For the time being it is a kind of low-hanging,
# mechanical port from C... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. The comparison is totally
# unfair, though, because this module utilizes the Galois Field Multiply
# instruction.

# Consume command-line arguments until one looks like an output file name
# ("name.ext"), or until the arguments run out (or a false argument is hit).
# Anything shifted off before the match is discarded.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found.  Re-opening an
# already open handle closes it first, so an unconditional, unchecked open
# with no file name would leave STDOUT closed and silently drop the
# generated code.  The three-argument form keeps the name from being parsed
# for mode characters, and a failure is reported instead of ignored.
if ($output) {
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Argument vector: registers holding the result pointer and the four
# operand words on entry.
$rp = "A4";
$a1 = "B4";
$a0 = "A6";
$b1 = "B6";
$b0 = "A8";

# Halfwords of the A operand and their partial products: the "lo" family
# is taken from A16..A20, the "hi" family from B16..B20.
($Alo, $Alox0, $Alox1, $Alox2, $Alox3) = map { "A$_" } 16 .. 20;
($Ahi, $Ahix0, $Ahix1, $Ahix2, $Ahix3) = map { "B$_" } 16 .. 20;

# Registers receiving the four bytes of the B operand.
$B_0 = "B5";
$B_1 = "A5";
$B_2 = "A7";
$B_3 = "B7";

# Scratch operand registers used for the second and third multiplications;
# they alias $Alo and $B_1.
$A = $Alo;
$B = $B_1;

# Register loaded with the 0xFF byte mask.
$xFF = "B1";

# mul_1x1_upper($A,$B)
#
# Appends to the global $code the first half of a one-word by one-word
# polynomial multiplication.
#
#   $A, $B - names of the registers holding the two operands (callers pass
#            register-name strings such as "A6").  These lexicals shadow
#            the file-level $A/$B inside this sub.
#
# The emitted instructions split $B into the byte registers $B_0..$B_3 and
# $A into the halfword registers $Alo/$Ahi, then issue eight XORMPY
# instructions whose results land in $Alox0..$Alox3 and $Ahix0..$Ahix3.
# All of those register names are file-level globals.  The partial products
# are left uncombined; the code emitted by mul_1x1_merged or mul_1x1_lower
# reads them.
#
# The heredoc is interpolating, so every $name in it (including the ones in
# the assembly ';' comments) is replaced by the register name.  Lines that
# start with '||' are TI assembler syntax for "issue in parallel with the
# previous instruction", so the line order here is significant.
sub mul_1x1_upper {
my ($A,$B)=@_;
$code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged($OUTlo,$OUThi,$A,$B)
#
# Appends to the global $code a block that interleaves two things:
#   * the combining step of the PREVIOUS multiplication (the same
#     instructions mul_1x1_lower emits), which folds the partial products
#     in $Alox0..3/$Ahix0..3 into the register pair $OUTlo/$OUThi, and
#   * the splitting/XORMPY step of the NEXT multiplication of $A by $B
#     (the same instructions mul_1x1_upper emits).
#
#   $OUTlo, $OUThi - names of the registers receiving the previous product.
#   $A, $B         - names of the registers holding the next operands.
#
# Instructions belonging to the "next" multiplication are written with one
# extra space after the tab so the two interleaved streams can be told
# apart by eye.
#
# The new $Alo x $B_0 product is written to A1 rather than to $Alox0, and
# only copied into $Alox0 by the final MV: the XOR that consumes the old
# $Alox0 value appears after that XORMPY in the instruction stream.
# NOTE(review): the exact placement presumably depends on XORMPY result
# latency on C64x+ - confirm against the instruction set manual before
# reordering anything here.
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;
$code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# mul_1x1_lower($OUTlo,$OUThi)
#
# Appends to the global $code the second half of a one-word by one-word
# polynomial multiplication: the partial products held in $Alox0..$Alox3
# and $Ahix0..$Ahix3 (file-level register names, filled by the code that
# mul_1x1_upper or mul_1x1_merged emitted earlier) are shifted into
# position and XORed together into the register pair $OUTlo/$OUThi.
#
#   $OUTlo, $OUThi - names of the registers receiving the low and high
#                    halves of the product.
#
# The emitted code overwrites $Ahix0, $Ahix1, $Ahix3 and $Alox1 while
# combining.  The leading ';NOP' is a commented-out NOP in the assembly
# output; the plain 'NOP' after the first packet is a real instruction.
# Lines that start with '||' issue in parallel with the previous
# instruction, so the line order here is significant.
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;
$code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Emit bn_GF2m_mul_2x2.  The routine is assembled from heredoc fragments
# and the mul_1x1_* generators; three one-word multiplications are chained
# (a0·b0, a1·b1 and (a0+a1)·(b0+b1)) and their results combined into the
# four words stored through $rp.
#
# Every fragment that begins with '||' issues in parallel with the last
# instruction emitted by the generator call just before it, so nothing may
# be inserted between a call and the heredoc that follows it.

# Section/symbol prologue and the byte mask used when splitting operands.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	mul_1x1_upper($a0,$b0);			# a0·b0

# Load the operands of the second multiplication into the scratch registers.
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1

# Form a0+a1 and b0+b1 (XOR is addition in GF(2)) for the third
# multiplication.
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)

$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
	mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)

# Return branch, final combination of the three products and the four
# result stores.
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

print $code;
# Buffered write errors only surface when the handle is closed, so a failed
# close must not pass silently.
close STDOUT or die "error closing STDOUT: $!";