#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both
# big- and little-endian cases. It likewise supports both 32- and
# 64-bit modes of operation, which is achieved by limiting the
# number of utilized registers to 16; this implies additional NEON
# load and integer instructions. That has no effect on the mighty
# Apple A7, where results are literally equal to the theoretical
# estimates based on AES instruction latencies and issue rates. On
# Cortex-A53, an in-order execution core, it costs up to 10-15%,
# which is partially compensated by a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57, parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer instructions mostly
# 64-bit ones. The goal is to maintain both 32- and 64-bit code
# within a single module and transliterate common code to either
# flavour with regex voodoo.
#
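# As a hedged illustration (an editorial example, not part of the
# generated code): given a source line
#
#	vld1.32	{q8},[x7],#16
#
# the 64-bit post-processing pass at the bottom of this file rewrites
# it to "ld1	{v16.4s},[x7],#16" (q8 maps to v16, the .32 suffix
# selects the .4s arrangement, and the v prefix is stripped), while
# the 32-bit pass keeps the NEON mnemonic and produces
# "vld1.32	{q8},[r7]!" by mapping x/w registers to r registers.
#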
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
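	// Sanity-check arguments: return -1 for a NULL input or output
	// pointer, -2 for a key size other than 128, 192 or 256 bits.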
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
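	// One round key per iteration (editorial note): the mask loaded
	// from .Lrcon makes vtbl rotate the last word of the previous
	// round key and splat it into all four lanes; an aese with an
	// all-zero round key then degenerates to SubBytes (ShiftRows is
	// a no-op on splatted data), leaving SubWord(RotWord(.)) in
	// every lane, ready to be xored with rcon.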
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
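	// Only 8 bytes of the second 192-bit key half belong in the
	// schedule at this point; the big-endian build stores all 16
	// bytes to keep word order intact and rewinds the output
	// pointer by 8 instead (an editorial note on the #ifdef below).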
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
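	// Editorial note: the loop below swaps round keys end-for-end
	// and runs every inner key through aesimc (InvMixColumns), so
	// decryption can use the equivalent inverse cipher with the
	// same round structure as encryption.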

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
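# gen_block() emits a one-block encrypt or decrypt routine. The round
# loop handles two rounds per iteration, pairing each aese/aesd with
# its aesmc/aesimc so the pairs can fuse on cores that support it;
# the last two rounds run outside the loop, and the final AddRoundKey
# is a plain veor with the last round key.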
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule
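#
# Editorial note: $rndzero_n_last caches rndkey[0]^rndkey[last]. The
# next plaintext block is pre-xored with it while the current block
# is still in flight, so the loop-top aese folds the previous block's
# final AddRoundKey, the CBC chaining xor and round 0 of the next
# block into its implicit key xor.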

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
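	// Editorial note: CBC decryption has no inter-block dependency,
	// so three blocks are pipelined through the rounds at once;
	// ciphertext copies are kept aside to supply the chaining xor
	// once the last round completes.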
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
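	// Editorial note: the 32-bit counter lives in the last lane of
	// the counter block in big-endian byte order (hence the rev on
	// little-endian hosts); three consecutive counter values are
	// kept in flight so the main loop encrypts three blocks per
	// iteration.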
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
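    # A hedged worked example: for "aese v0.16b,v1.16b" the expression
    # above yields 0x4e284800|0|(1<<5) = 0x4e284820, the literal AESE
    # encoding. The substitution that would invoke unaes() stays
    # commented out below because ".arch armv8-a+crypto" already lets
    # the assembler accept these mnemonics directly.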

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian;
	    # the correct solution is to use the .inst directive, but
	    # older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
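    # A hedged worked example: "aese q0,q1" yields
    # 0xf3b00300|(1<<1) = 0xf3b00302 (q1 occupies d2/d3, and the even
    # d-register index lands in the low Vm bits), emitted as the
    # little-endian byte sequence 0x02,0x03,0xb0,0xf3.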

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
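    # E.g. "vtbl.8 q3,{q2},q1" becomes the pair "vtbl.8 d6,{q2},d2" /
    # "vtbl.8 d7,{q2},d3", since 32-bit vtbl can only write one
    # d-register at a time.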

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";