#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# The eternal question is: what's wrong with compiler-generated code? The
# trick is that it's possible to reduce the number of shifts required
# to perform rotations by maintaining a copy of the 32-bit value in the
# upper bits of a 64-bit register. Just follow the mux2 and shrp
# instructions... Performance under a big-endian OS such as HP-UX is
# 179MBps*1GHz, which is >50% better than HP C and >2x better than gcc.

# Start the output buffer with the assembler identification directives.
# Everything between the heredoc markers is emitted verbatim (after Perl
# interpolation), so no stray text may appear inside it.
$code=<<___;
.ident  \"sha1-ia64.s, version 1.2\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit

___


# Choose the instruction used to form addresses.  On HP-UX addp4 is used
# unless one of the flags +DD64 or -mlp64 is present in @ARGV (presumably
# the LP64 switches of the HP and GNU compilers -- confirm); on every other
# OS plain add is used.
if ($^O eq "hpux") {
    $ADDP="addp4";
    # Alternation, not a character class: the former /[\+DD|\-mlp]64/
    # matched any single one of "+D|-mlp" followed by 64, e.g. -DFOO_ID64.
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Byte order: an explicit -DB_ENDIAN / -DL_ENDIAN flag wins (the later of the
# two if both are given); otherwise probe the byte order of the machine that
# runs this script.
for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/);   }
if (!defined($big_endian))
	    {	$big_endian=(unpack('L',pack('N',1))==1);   }

# Operand names substituted into the assembly templates below.  With $human
# set, mnemonic names are used instead of registers, which is useful for
# visual code auditing; otherwise real register names are assigned.
#$human=1;
if (!$human) {
	# working variables and the temporary: loc0..loc5
	($A,$B,$C,$D,$E,$T)   = map { "loc$_" } 0..5;
	# chaining values: loc6..loc10
	($h0,$h1,$h2,$h3,$h4) = map { "loc$_" } 6..10;
	# round constants
	($K_00_19, $K_20_39)  = ("r14", "r15");
	($K_40_59, $K_60_79)  = ("loc11", "loc12");
	# 16-entry message schedule window: r16..r31
	@X = map { "r$_" } 16..31;
}
else {
	($A,$B,$C,$D,$E,$T)   = qw(A B C D E T);
	($h0,$h1,$h2,$h3,$h4) = map { "h$_" } 0..4;
	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
	    qw(K_00_19 K_20_39 K_40_59 K_60_79);
	@X = map { "X$_" } 0..15;
}

# BODY_00_15(\$code, $i, $a, $b, $c, $d, $e, $f)
#
# Appends the assembly for round $i (the driver calls this for 0..15) to the
# string referenced by the first argument.  $a..$e name the registers holding
# the working variables, $f names the register that receives the round result
# (xi + e + K_00_19 + F_00_19(b,c,d) + ROTATE(a,5)).
#
# The message word is assembled from single-byte loads through inp and tmp3:
#   * round 0 additionally emits the first loads for X[0];
#   * rounds 0..14 interleave the loads for X[i+1] (marked "+1") with the
#     arithmetic of the current round;
#   * round 15 loads nothing and instead starts the schedule update for the
#     next round, leaving ROTATE(x[0]^x[2]^x[8]^x[13],1) in $e.
#
# The heredocs are emitted verbatim, so they must contain assembly only.
sub BODY_00_15 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;

$code.=<<___ if ($i==0);
{ .mmi;	ld1	$X[$i&0xf]=[inp],2	    // MSB
	ld1	tmp2=[tmp3],2		};;
{ .mmi;	ld1	tmp0=[inp],2
	ld1	tmp4=[tmp3],2		    // LSB
	dep	$X[$i&0xf]=$X[$i&0xf],tmp2,8,8	};;
___
if ($i<15) {
	$code.=<<___;
{ .mmi;	ld1	$X[($i+1)&0xf]=[inp],2	    // +1
	dep	tmp1=tmp0,tmp4,8,8	};;
{ .mmi;	ld1	tmp2=[tmp3],2		    // +1
	and	tmp4=$c,$b
	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp0=$e,$K_00_19
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
	extr.u	tmp1=$a,27,5		};; // a>>27
{ .mmi;	ld1	tmp0=[inp],2		    // +1
	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	ld1	tmp4=[tmp3],2		    // +1
	or	tmp5=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp5		    // f+=ROTATE(a,5)
	dep	$X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8	// +1
	mux2	$X[$i&0xf]=$X[$i&0xf],0x44	} //;;

___
	}
else	{
	$code.=<<___;
{ .mii;	and	tmp3=$c,$b
	dep	tmp1=tmp0,tmp4,8,8;;
	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp0=$e,$K_00_19
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi; or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	mux2	$X[$i&0xf]=$X[$i&0xf],0x44  };;

___
	}
}

# BODY_16_19(\$code, $i, $a, $b, $c, $d, $e, $f)
#
# Appends the assembly for round $i (the driver calls this for 16..19) to the
# string referenced by the first argument.  On entry $f names the register
# that already holds this round's schedule word; it is stored into the
# schedule window slot X[$i&0xf] ("Xupdate") and then accumulates
# e + K_00_19 + F_00_19(b,c,d) + ROTATE(a,5).  The next round's schedule word,
# ROTATE(x[0]^x[2]^x[8]^x[13],1), is computed alongside (lines marked "+1")
# and left in $e.
sub BODY_16_19 {
# Alias the package variable $code to the caller's buffer.
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;

# The heredoc is emitted verbatim after interpolation; keep it assembly-only.
$code.=<<___;
{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
	and	tmp0=$c,$b
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mmi;	andcm	tmp1=$d,$b
	add	tmp4=$e,$K_00_19	};;
{ .mmi;	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	add	$f=$f,tmp4		    // f+=e+K_00_19
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp0		    // f+=F_00_19(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
}

# BODY_20_39(\$code, $i, $a, $b, $c, $d, $e, $f [, $Konst])
#
# Appends the assembly for a round using F_20_39(b,c,d)=b^c^d to the string
# referenced by the first argument.  $Konst names the register holding the
# round constant and defaults to $K_20_39; BODY_60_79 passes $K_60_79.
#
# For $i<79 the round also prepares the next schedule word in $e (lines
# marked "+1").  For the final round ($i==79) no further schedule word is
# needed; instead the results are folded into the chaining values h1..h4
# ("wrap up") and tmp3 is re-derived from inp for the next block.
#
# The heredocs are emitted verbatim, so they must contain assembly only.
sub BODY_20_39 {
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
	$Konst = $K_20_39 if (!defined($Konst));

if ($i<79) {
$code.=<<___;
{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	add	tmp4=$e,$Konst		};;
{ .mmi;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	add	$f=$f,tmp4		    // f+=e+K_20_39
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
	nop.i	0			};;
{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
	xor	tmp2=tmp2,tmp3		    // +1
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
}
else {
$code.=<<___;
{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	add	tmp4=$e,$Konst		};;
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mib;	add	$f=$f,tmp4		    // f+=e+K_20_39
	add	$h1=$h1,$a		};; // wrap up
{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30) ;;?
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	add	$h3=$h3,$c		};; // wrap up
{ .mib;	add	tmp3=1,inp		    // used in unaligned codepath
	add	$f=$f,tmp1		}   // f+=ROTATE(a,5)
{ .mib;	add	$h2=$h2,$b		    // wrap up
	add	$h4=$h4,$d		};; // wrap up

___
}
}

# BODY_40_59(\$code, $i, $a, $b, $c, $d, $e, $f)
#
# Appends the assembly for round $i (the driver calls this for 40..59) to the
# string referenced by the first argument.  $f names the register holding
# this round's schedule word; it is stored into X[$i&0xf] ("Xupdate") and
# then accumulates e + K_40_59 + F_40_59(b,c,d) + ROTATE(a,5), where
# F_40_59(b,c,d)=(b&c)|(b&d)|(c&d).  The next round's schedule word is
# computed alongside (lines marked "+1") and left in $e.
sub BODY_40_59 {
# Alias the package variable $code to the caller's buffer.
local	*code=shift;
local	($i,$a,$b,$c,$d,$e,$f)=@_;

# The heredoc is emitted verbatim after interpolation; keep it assembly-only.
$code.=<<___;
{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
	and	tmp0=$c,$b
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mmi;	and	tmp1=$d,$b
	add	tmp4=$e,$K_40_59	};;
{ .mmi;	or	tmp0=tmp0,tmp1		    // (b&c)|(b&d)
	add	$f=$f,tmp4		    // f+=e+K_40_59
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	and	tmp4=$c,$d
	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
	};;
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	xor	tmp2=tmp2,tmp3		    // +1
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp0=tmp0,tmp4		    // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$f=$f,tmp0		    // f+=F_40_59(b,c,d)
	shrp	$e=tmp2,tmp2,31;;	    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
	add	$f=$f,tmp1		};; // f+=ROTATE(a,5)

___
}
# Rounds 60..79: delegate to BODY_20_39, appending $K_60_79 as the round
# constant argument.
sub BODY_60_79 {
	return BODY_20_39(@_, $K_60_79);
}

# Function entry for sha1_block_data_order(ctx, inp, num).
#
# Emits, in order: the temporary-register aliases, the symbol/proc
# directives, the prologue (register frame allocation, saving ar.lc in r3 and
# the predicates in r2, address formation with $ADDP), loading of h0..h4 from
# ctx and of the four round constants, loop-counter setup (ar.lc = num-1) and
# the loop top .Ldtop, where h0..h4 are copied into the working variables.
#
# The heredoc is emitted verbatim, so it must contain assembly only.
$code.=<<___;
.text

tmp0=r8;
tmp1=r9;
tmp2=r10;
tmp3=r11;
ctx=r32;	// in0
inp=r33;	// in1

// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
.global	sha1_block_data_order#
.proc	sha1_block_data_order#
.align	32
sha1_block_data_order:
	.prologue
{ .mmi;	alloc	tmp1=ar.pfs,3,15,0,0
	$ADDP	tmp0=4,ctx
	.save	ar.lc,r3
	mov	r3=ar.lc		}
{ .mmi;	$ADDP	ctx=0,ctx
	$ADDP	inp=0,inp
	mov	r2=pr			};;
tmp4=in2;
tmp5=loc13;
tmp6=loc14;
	.body
{ .mlx;	ld4	$h0=[ctx],8
	movl	$K_00_19=0x5a827999	}
{ .mlx;	ld4	$h1=[tmp0],8
	movl	$K_20_39=0x6ed9eba1	};;
{ .mlx;	ld4	$h2=[ctx],8
	movl	$K_40_59=0x8f1bbcdc	}
{ .mlx;	ld4	$h3=[tmp0]
	movl	$K_60_79=0xca62c1d6	};;
{ .mmi;	ld4	$h4=[ctx],-16
	add	in2=-1,in2		    // adjust num for ar.lc
	mov	ar.ec=1			};;
{ .mmi;	nop.m	0
	add	tmp3=1,inp
	mov	ar.lc=in2		};; // brp.loop.imp: too far

.Ldtop:
{ .mmi;	mov	$A=$h0
	mov	$B=$h1
	mux2	tmp6=$h1,0x44		}
{ .mmi;	mov	$C=$h2
	mov	$D=$h3
	mov	$E=$h4			};;

___

# Emit all 80 rounds.  @V lists the six register names in the roles
# (a,b,c,d,e,f) expected by the BODY_* generators; after every round the list
# is rotated right by one position, so each name takes the next role without
# any register moves being emitted.
{ my $i;
  # Declared separately: "my $i,@V=(...)" made only $i lexical and left @V
  # as a package global.
  my @V=($A,$B,$C,$D,$E,$T);

	for($i=0;$i<16;$i++)	{ BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<40;$i++)	{ BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<60;$i++)	{ BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }

	# double-check: the final rotation must match the registers that the
	# round-79 wrap-up and the following "add h0=h0,E" rely on
	(($V[5] eq $D) and ($V[0] eq $E))
	    or die "sha1-ia64.pl: register rotation out of step after 80 rounds";
}

# Loop bottom and function exit: fold the last round's result into h0,
# branch back to .Ldtop while blocks remain, then restore ar.lc from r3,
# store h0..h4 back through ctx, restore the predicates from r2 and return.
# The heredoc is emitted verbatim, so it must contain assembly only.
$code.=<<___;
{ .mmb;	add	$h0=$h0,$E
	nop.m	0
	br.ctop.dptk.many	.Ldtop	};;
.Ldend:
{ .mmi;	add	tmp0=4,ctx
	mov	ar.lc=r3		};;
{ .mmi;	st4	[ctx]=$h0,8
	st4	[tmp0]=$h1,8		};;
{ .mmi;	st4	[ctx]=$h2,8
	st4	[tmp0]=$h3		};;
{ .mib;	st4	[ctx]=$h4,-16
	mov	pr=r2,0x1ffff
	br.ret.sptk.many	b0	};;
.endp	sha1_block_data_order#
stringz	"SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the generated assembly to standard output.
print $code;