#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
# in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about "528B" variant. In ARM case it makes less sense to
# implement it for the following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

# Output selection: consume command-line arguments until one looks like a
# file name ("name.ext"); any arguments before it are discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when an argument was left in $output, and abort if the
# file cannot be opened instead of ignoring the failure.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Registers holding the function arguments of the integer-only routines.
$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

# Working registers: Z is the 128-bit accumulator and T the table entry
# being folded in, each split into four 32-bit words (ll = lowest word,
# hh = highest word).
$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

# gcm_gmult_4bit takes no input pointer or length, so it reuses those two
# argument registers for the rem_4bit table address and the loop counter.
$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

# Zsmash(@insns)
#
# Appends to $code the instructions that store the accumulator words
# $Zll,$Zlh,$Zhl,$Zhh to [$Xi,#12], [$Xi,#8], [$Xi,#4] and [$Xi,#0]
# respectively.  Three variants are emitted under preprocessor guards:
# byte-reverse then word store (ARMv7+ little-endian), plain word store
# (big-endian), and four byte stores using $Tlh/$Thl/$Thh as scratch
# (everything else).
#
# After each word's store sequence one element of @insns is emitted on a
# line of its own (prefixed with a tab), which lets the caller interleave
# up to four unrelated instructions with the stores.  When fewer than four
# are supplied, the remaining slots produce a line containing only a tab.
#
# The sub deliberately has no prototype: callers pass arguments.
sub Zsmash {
  my @args=@_;
  my $i=12;			# byte offset of the word within Xi
  for my $reg ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$reg,$reg
	str	$reg,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$reg,[$Xi,#$i]
#else
	mov	$Tlh,$reg,lsr#8
	strb	$reg,[$Xi,#$i+3]
	mov	$Thl,$reg,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$reg,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}

# First part of the generated assembly:
#  - rem_4bit: table of sixteen 16-bit reduction constants (32 bytes);
#  - rem_4bit_get: helper for gcm_gmult_4bit that derives the table address
#    from pc (the helper immediately follows the 32-byte table) and branches
#    back to .Lrem_4bit_got;
#  - gcm_ghash_4bit up to the end of its inner loop.  It turns $len into an
#    end pointer, saves it together with r4-r11/lr, copies rem_4bit onto the
#    stack (32 bytes of table + 16 bytes of rem_4bit_get = the #48 offset),
#    and then processes the input 16 bytes at a time, one byte (two nibbles)
#    per .Linner iteration.
# The instruction order is a hand-tuned schedule; do not reorder.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	$rem_4bit,pc,#8
	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
	# Store Z back to Xi, interleaving the end-of-input compare and the
	# conditional load of the next block's last byte with the stores.
	# Keep the &-call form: it works regardless of any prototype that
	# Zsmash is declared with.
	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
# Tail of gcm_ghash_4bit (loop back or unwind the 36 bytes of stack holding
# the rem_4bit copy and the saved end pointer, then return), followed by
# gcm_gmult_4bit up to the end of its loop.  gcm_gmult_4bit obtains the
# rem_4bit address through rem_4bit_get and indexes the table directly
# instead of using a stack copy.
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
	# Store Z back to Xi; there is nothing to interleave here, so no
	# extra instructions are passed.
	&Zsmash();
# Epilogue of gcm_gmult_4bit: restore r4-r11 and return.  Before ARMv5 the
# return goes through "bx lr" only when lr has its low bit set, otherwise
# through "mov pc,lr".
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
# NEON register allocation.
#  $Xl,$Xm,$Xh - Q registers receiving the low, middle and high partial
#                products of the multiplication; $IN holds the input block.
#  $t0..$t3    - Q scratch registers used inside clmul64x64.
#  $Hlo,$Hhi   - D registers holding the two halves of twisted H,
#  $Hhl        - their xor, and $k48,$k32,$k16 - 64-bit masks.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..11));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

# clmul64x64($r,$a,$b)
#
# Appends to $code a NEON sequence that multiplies the two 64-bit
# polynomials in D registers $a and $b and leaves the product in Q register
# $r.  The product is assembled from 8-bit polynomial multiplies
# (vmull.p8) of byte-rotated copies of the operands (A1..A3, B1..B4 in the
# comments below), which are masked with $k48/$k32/$k16, shifted into place
# with vext.8 and xored onto D = A*B.
#
#   $r - name of the destination Q register; its low half is also used as
#        scratch, so it must be written as a "qN" name
#   $a - name of the first operand (D register), left unmodified
#   $b - name of the second operand (D register, may be a "qN#lo"/"qN#hi"
#        string resolved by the post-processing loop), left unmodified
#
# Clobbers $t0..$t3.  The instruction order is a hand-tuned schedule.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
}

# NEON entry points, assembled only for ARMv7 and later:
#  - gcm_init_neon: loads H from [r1], rotates it left by one bit and, when
#    the bit shifted out was set, xors in the 0xc2....01 constant; the
#    resulting "twisted H" is stored at [r0];
#  - gcm_gmult_neon: loads Xi as the single input block, sets $len to 16 so
#    the shared loop runs exactly once, and jumps to .Lgmult_neon;
#  - gcm_ghash_neon (head): loads Xi and twisted H, then for every 16-byte
#    block loads the input and xors it with Xi before the multiplication.
# On little-endian builds the 16-byte values are byte-reversed within each
# 64-bit half after loading.
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		$IN#hi,[r1,:64]!	@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1,:64]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
	vld1.64		$IN#lo,[$Xi,:64]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
	vld1.64		$Xl#lo,[$Xi,:64]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
	# Karatsuba multiplication of the 128-bit input by twisted H: three
	# 64x64 carry-less products (low halves, xor of halves, high halves).
	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
$code.=<<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
# Combine the three partial products into the 256-bit result Xh|Xl, reduce
# it to 128 bits in two phases, and loop while input remains.  On exit Xi
# is written back; both entry points advanced $Xi by 16 while loading it,
# hence the "sub $Xi,#16" before the store.
$code.=<<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub		$Xi,#16
	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
	vst1.64		$Xl#lo,[$Xi,:64]

	ret					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
# Identification string embedded in the object file.
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Post-process the accumulated assembly line by line and print it:
#  1. text between backticks is evaluated as Perl and replaced by the result;
#  2. then the first of the following that matches is applied, and the rest
#     are skipped for that line:
#     a. "qN#lo" / "qN#hi" becomes the D register aliasing that half of qN
#        (d(2N) / d(2N+1));
#     b. "ret" becomes "bx lr" and is left in that form;
#     c. a literal "bx lr" becomes its machine encoding as a .word.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

	print $_,"\n";
}
# Closing flushes the buffered output; report a failure (e.g. a full disk)
# instead of leaving a truncated file behind silently.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush