#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;

# Command line: <flavour> <output>.  $flavour is matched against /64/
# further down to choose between 64- and 32-bit output; both values are
# also handed to arm-xlate.pl.
$flavour = shift;
$output  = shift;

# Directory part of $0 (including the trailing separator), or "" when $0
# carries no directory.  Used to look for the translator next to this
# script and then under ../../perlasm/.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything printed to STDOUT is piped through the
# translator.  $output is quoted so that a path containing spaces reaches
# the translator as one argument, and a failure to start the pipe is
# fatal instead of being silently ignored.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Common name prefix of every symbol emitted by this module.
$prefix="aes_v8";

# $code accumulates the assembly template; it is rewritten line by line
# and printed at the end of the script.  The preamble selects the target
# architecture according to $flavour.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
83
.Lrcon:
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
99 100 101 102 103 104 105 106 107 108 109 110 111
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

112
	adr	$ptr,.Lrcon

	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
251
	mov	$ptr,#0
252

253 254
.Lenc_key_abort:
	mov	x0,$ptr			// return value
255 256 257 258 259 260 261 262 263 264
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
265
	.inst	0xd503233f		// paciasp
266 267 268 269 270 271 272 273 274
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

275 276 277
	cmp	x0,#0
	b.ne	.Ldec_key_abort

278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
302
.Ldec_key_abort:
303 304 305 306 307 308
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
309
	.inst	0xd50323bf		// autiasp
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) - append the single-block routine ${prefix}_${dir}crypt
# to $code.
#
# $dir is "en" or "de": "en" selects the aese/aesmc instruction pair, any
# other value selects aesd/aesimc.  In the template's 64-bit spelling the
# emitted routine takes the input pointer, output pointer and key schedule
# in x0-x2 and loads the round count from offset 240 of the key schedule.
# Nothing is returned; the only effect is the text appended to $code.
#
# Declared without a prototype: the sub takes one argument, so an empty
# prototype would only force callers to bypass it with the &-form.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));

# Two round keys are kept in flight: each half of the loop consumes one
# while the next one is being loaded, hence the count stepping by 2.
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
361
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
362 363 364
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
365
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

410
	cmp	$rounds,#2
411 412
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
413 414
	b.eq	.Lcbc_enc128

415 416 417 418 419 420 421 422 423 424 425
	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
426 427
.Loop_cbc_enc:
	aese	$dat,q8
428
	aesmc	$dat,$dat
429 430
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
431
	aese	$dat,q9
432
	aesmc	$dat,$dat
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop
449

450
.Lcbc_enc192:
451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
467
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
468 469 470 471 472 473
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

474
	vst1.8	{$ivec},[$out],#16
475 476
	b	.Lcbc_done

477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513
.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
514 515 516 517
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
518 519
.align	5
.Lcbc_dec:
520 521 522 523 524 525
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
526 527
	b.lo	.Lcbc_dec_tail

528 529 530
	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
531
	vorr	$in1,$dat1,$dat1
532
	vorr	$in2,$dat2,$dat2
533

534
.Loop3x_cbc_dec:
535 536
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
537
	aesd	$dat1,q8
538
	aesimc	$dat1,$dat1
539
	aesd	$dat2,q8
540
	aesimc	$dat2,$dat2
541
	vld1.32	{q8},[$key_],#16
542 543 544
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
545
	aesd	$dat1,q9
546
	aesimc	$dat1,$dat1
547
	aesd	$dat2,q9
548
	aesimc	$dat2,$dat2
549
	vld1.32	{q9},[$key_],#16
550
	b.gt	.Loop3x_cbc_dec
551 552 553

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
554
	aesd	$dat1,q8
555
	aesimc	$dat1,$dat1
556
	aesd	$dat2,q8
557
	aesimc	$dat2,$dat2
558 559
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
560
	 veor	$tmp1,$in0,$rndlast
561
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
562 563
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
564
	aesd	$dat1,q9
565
	aesimc	$dat1,$dat1
566
	aesd	$dat2,q9
567
	aesimc	$dat2,$dat2
568
	 veor	$tmp2,$in1,$rndlast
569 570 571
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
572 573 574
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
575
	aesimc	$dat0,$dat0
576
	aesd	$dat1,q12
577
	aesimc	$dat1,$dat1
578
	aesd	$dat2,q12
579 580
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
581
	aesd	$dat0,q13
582
	aesimc	$dat0,$dat0
583
	aesd	$dat1,q13
584
	aesimc	$dat1,$dat1
585
	aesd	$dat2,q13
586 587
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
588 589
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
590
	aesd	$dat1,q14
591
	aesimc	$dat1,$dat1
592
	aesd	$dat2,q14
593
	aesimc	$dat2,$dat2
594
	 vld1.8	{$in2},[$inp],#16
595 596
	aesd	$dat0,q15
	aesd	$dat1,q15
597
	aesd	$dat2,q15
598
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
599
	 add	$cnt,$rounds,#2
600 601
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
602 603
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
A
Andy Polyakov 已提交
604
	vst1.8	{$tmp0},[$out],#16
605
	 vorr	$dat0,$in0,$in0
A
Andy Polyakov 已提交
606
	vst1.8	{$tmp1},[$out],#16
607
	 vorr	$dat1,$in1,$in1
608 609 610
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec
611

612
	cmn	$len,#0x30
613
	b.eq	.Lcbc_done
614
	nop
615 616

.Lcbc_dec_tail:
617 618
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
619
	aesd	$dat2,q8
620
	aesimc	$dat2,$dat2
621
	vld1.32	{q8},[$key_],#16
622
	subs	$cnt,$cnt,#2
623 624
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
625
	aesd	$dat2,q9
626
	aesimc	$dat2,$dat2
627
	vld1.32	{q9},[$key_],#16
628 629
	b.gt	.Lcbc_dec_tail

630 631
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
632
	aesd	$dat2,q8
633 634 635
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
636
	aesd	$dat2,q9
637 638 639
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
640
	aesd	$dat2,q12
641 642 643 644
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
645
	aesd	$dat2,q13
646 647 648 649
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
650
	aesd	$dat2,q14
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
667 668 669 670 671

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
672
}
673 674 675 676 677 678 679 680 681 682 683 684
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
685 686
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
687 688 689 690
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

691
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
692
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
721 722 723 724
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
725 726 727 728 729 730
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
731
	cclr		$step,lo
A
Andy Polyakov 已提交
732
#ifndef __ARMEB__
733 734 735
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
736 737 738
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
739
	vorr		$ivec,$dat0,$dat0
740
	rev		$tctr1, $tctr1
741
	vmov.32		${dat1}[3],$tctr1
742 743 744 745 746
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32
747

748 749
.align	4
.Loop3x_ctr32:
750 751
	aese		$dat0,q8
	aesmc		$dat0,$dat0
752
	aese		$dat1,q8
753
	aesmc		$dat1,$dat1
754
	aese		$dat2,q8
755
	aesmc		$dat2,$dat2
756
	vld1.32		{q8},[$key_],#16
757 758 759
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
760
	aese		$dat1,q9
761
	aesmc		$dat1,$dat1
762
	aese		$dat2,q9
763
	aesmc		$dat2,$dat2
764
	vld1.32		{q9},[$key_],#16
765
	b.gt		.Loop3x_ctr32
766 767 768

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
769
	aese		$dat1,q8
770
	aesmc		$tmp1,$dat1
771
	 vld1.8		{$in0},[$inp],#16
772
	 vorr		$dat0,$ivec,$ivec
773 774
	aese		$dat2,q8
	aesmc		$dat2,$dat2
775
	 vld1.8		{$in1},[$inp],#16
776
	 vorr		$dat1,$ivec,$ivec
777
	aese		$tmp0,q9
778
	aesmc		$tmp0,$tmp0
779
	aese		$tmp1,q9
780
	aesmc		$tmp1,$tmp1
781 782 783
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
784 785 786
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
787
	aese		$tmp0,q12
788
	aesmc		$tmp0,$tmp0
789
	aese		$tmp1,q12
790
	aesmc		$tmp1,$tmp1
791 792
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
793
	aese		$tmp2,q12
794 795 796
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
797
	aese		$tmp0,q13
798
	aesmc		$tmp0,$tmp0
799
	aese		$tmp1,q13
800
	aesmc		$tmp1,$tmp1
801 802
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
803
	aese		$tmp2,q13
804 805 806
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
807
	aese		$tmp0,q14
808
	aesmc		$tmp0,$tmp0
809
	aese		$tmp1,q14
810
	aesmc		$tmp1,$tmp1
811
	 vmov.32	${dat1}[3], $tctr1
812
	 rev		$tctr2,$ctr
813
	aese		$tmp2,q14
814 815 816
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
817 818
	aese		$tmp0,q15
	aese		$tmp1,q15
819
	aese		$tmp2,q15
820 821

	veor		$in0,$in0,$tmp0
822 823
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
824
	veor		$in1,$in1,$tmp1
825 826
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
827 828 829 830
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32
831

832
	adds		$len,$len,#3
833
	b.eq		.Lctr32_done
834 835 836
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq
837

838
.Lctr32_tail:
839 840
	aese		$dat0,q8
	aesmc		$dat0,$dat0
841
	aese		$dat1,q8
842
	aesmc		$dat1,$dat1
843
	vld1.32		{q8},[$key_],#16
844
	subs		$cnt,$cnt,#2
845 846
	aese		$dat0,q9
	aesmc		$dat0,$dat0
847
	aese		$dat1,q9
848
	aesmc		$dat1,$dat1
849
	vld1.32		{q9},[$key_],#16
850 851 852
	b.gt		.Lctr32_tail

	aese		$dat0,q8
853
	aesmc		$dat0,$dat0
854
	aese		$dat1,q8
855
	aesmc		$dat1,$dat1
856
	aese		$dat0,q9
857
	aesmc		$dat0,$dat0
858
	aese		$dat1,q9
859
	aesmc		$dat1,$dat1
860
	 vld1.8		{$in0},[$inp],$step
861 862
	aese		$dat0,q12
	aesmc		$dat0,$dat0
863
	aese		$dat1,q12
864
	aesmc		$dat1,$dat1
865
	 vld1.8		{$in1},[$inp]
866 867
	aese		$dat0,q13
	aesmc		$dat0,$dat0
868
	aese		$dat1,q13
869
	aesmc		$dat1,$dat1
870
	 veor		$in0,$in0,$rndlast
871
	aese		$dat0,q14
872
	aesmc		$dat0,$dat0
873
	aese		$dat1,q14
874 875
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
876
	aese		$dat0,q15
877 878
	aese		$dat1,q15

879
	cmp		$len,#1
880 881 882 883
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
884
	vst1.8		{$in1},[$out]
885 886 887 888 889 890 891 892 893 894 895 896 897 898 899

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
A
Andy Polyakov 已提交
900 901 902
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    # Base instruction words for the AES mnemonics in this (64-bit)
    # branch; unaes() ORs the register numbers into them.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # unaes($mnemonic,$arg) - render an AES instruction as a raw ".inst"
    # word followed by the original text as a comment.  $arg is the
    # operand string; the first q/v register number lands in bits 0-4 and
    # the second in bits 5-9.  Yields a false value when $arg does not
    # contain two q/v registers.
    #
    # The rewrite rule that would call this in the loop of this branch is
    # commented out, so the helper is currently unused here.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    # Rewrite the template line by line into 64-bit syntax and print it.
    foreach(split("\n",$code)) {
	# Expand `...` spans by evaluating their contents as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	# qN becomes vN.16b for N<8 and v(N+8).16b otherwise.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# The rules below are chained with "or": the first one that
	# matches is applied and the rest are skipped for this line.
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes: drop the element-size suffix
	# from the mnemonic and move it onto the register arrangement
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # Base instruction words for the AES mnemonics in this (32-bit)
    # branch; unaes() ORs the register numbers into them.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # unaes($mnemonic,$arg) - render an AES instruction as four ".byte"
    # values followed by the original text as a comment.  $arg is the
    # operand string.  The first q/v register number is split into bits
    # 13-15 and bit 22 of the word, the second into bits 1-3 and bit 5.
    # Yields a false value when $arg does not contain two q/v registers.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # unvtbl($arg) - split "vtbl.8 qD,{qT},qM" operands into two
    # d-register vtbl.8 instructions: the low halves (d2D, d2M) first,
    # then the high halves (d2D+1, d2M+1), both using table {qT}.
    # Returns the two instructions joined by "\n\t", or an empty string
    # when $arg does not have the "qD,{qT},qM" shape.
    sub unvtbl {
	my $arg=shift;

	return "" unless $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o;

	return sprintf("vtbl.8\td%d,{q%d},d%d\n\t".
		       "vtbl.8\td%d,{q%d},d%d",
		       2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1);
    }

    # unvdup32($arg) - rewrite "vdup.32 qD,qS[i]" operands so that the
    # source lane is addressed through a d register: lane i of qS becomes
    # lane (i&1) of d(2*S + (i>>1)).  Returns the rewritten instruction,
    # or an empty string when $arg does not have the "qD,qS[i]" shape.
    sub unvdup32 {
	my $arg=shift;

	return "" unless $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;

	return sprintf("vdup.32\tq%d,d%d[%d]",$1,2*$2+($3>>1),$3&1);
    }

    # unvmov32($arg) - rewrite "vmov.32 qD[i],<src>" operands so that the
    # destination lane is addressed through a d register: lane i of qD
    # becomes lane (i&1) of d(2*D + (i>>1)); <src> is passed through
    # unchanged.  Returns the rewritten instruction, or an empty string
    # when $arg does not start with the "qD[i]," shape.
    sub unvmov32 {
	my $arg=shift;

	return "" unless $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o;

	return sprintf("vmov.32\td%d[%d],%s",2*$1+($2>>1),$2&1,$3);
    }

    # Rewrite the template line by line into 32-bit syntax and print it.
    foreach(split("\n",$code)) {
	# Expand `...` spans by evaluating their contents as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes: post-increment becomes
	# writeback ("!"); an 8-byte step on a q register is turned into
	# an access to its low d half
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# The rules below are chained with "or": the first one that
	# matches is applied and the rest are skipped for this line.
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

1011
close STDOUT or die "error closing STDOUT: $!";