aesfx-sparcv9.pl 27.5 KB
Newer Older
R
Rich Salz 已提交
1 2 3 4 5 6 7 8
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

9 10 11 12 13 14 15 16 17 18 19 20
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
# required key setup and single-block procedures.
21 22 23 24 25 26 27 28
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that the parallelizable nature of CBC decrypt and CTR is not utilized
# yet. CBC encrypt, on the other hand, is as good as it can possibly
# get, processing one byte in 4.1 cycles with a 128-bit key on SPARC64 X.
# This is ~6x faster than a pure software implementation...
29 30 31 32 33 34
#
# July 2016
#
# Switch from faligndata to fshiftorx, which makes it possible to omit
# alignaddr instructions and improves single-block and short-input
# performance with misaligned data.
35 36 37 38 39 40 41 42

# The last command-line argument names the file that receives the
# generated assembly.  Without an argument the inherited STDOUT is used
# as is.  The three-argument form of open keeps characters in the file
# name from being read as part of the open mode, and a failure to create
# the file is reported instead of being silently ignored.
$output = pop;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
43 44 45 46
#include "sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

47 48 49 50 51 52
.text

.globl	aes_fx_encrypt
.align	32
aes_fx_encrypt:
	and		$inp, 7, $tmp		! is input aligned?
53 54
	andn		$inp, 7, $inp
	ldd		[$key +  0], %f6	! round[0]
55
	ldd		[$key +  8], %f8
56 57
	mov		%o7, %g1
	ld		[$key + 240], $rounds
58

59 60 61 62
1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

	sll		$tmp, 3, $tmp
63 64 65 66
	ldd		[$inp + 0], %f0		! load input
	brz,pt		$tmp, .Lenc_inp_aligned
	ldd		[$inp + 8], %f2

67
	ldd		[%o7 + $tmp], %f14	! shift left params
68
	ldd		[$inp + 16], %f4
69 70
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2
71 72

.Lenc_inp_aligned:
73
	ldd		[$key + 16], %f10	! round[1]
74 75 76 77
	ldd		[$key + 24], %f12

	fxor		%f0, %f6, %f0		! ^=round[0]
	fxor		%f2, %f8, %f2
78 79 80
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $key
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
	sub		$rounds, 4, $rounds

.Loop_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10
	ldd		[$key + 24], %f12
	add		$key, 32, $key

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key +  0], %f6
	ldd		[$key +  8], %f8

	brnz,a		$rounds, .Loop_enc
	sub		$rounds, 2, $rounds

	andcc		$out, 7, $tmp		! is output aligned?
101
	andn		$out, 7, $out
102
	mov		0xff, $mask
103 104 105
	srl		$mask, $tmp, $mask
	add		%o7, 64, %o7
	sll		$tmp, 3, $tmp
106 107 108 109

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
110 111
	ldd		[%o7 + $tmp], %f14	! shift right params

112 113 114 115
	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

116 117
	bnz,pn		%icc, .Lenc_out_unaligned
	mov		%g1, %o7
118 119 120 121 122

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

123
.align	16
124
.Lenc_out_unaligned:
125 126 127 128 129
	add		$out, 16, $inp
	orn		%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8
130 131 132

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
133
	stda		%f8, [$inp + $tmp]0xc0	! partial store
134 135
	retl
	nop
136
.type	aes_fx_encrypt,#function
137 138 139 140 141 142
.size	aes_fx_encrypt,.-aes_fx_encrypt

.globl	aes_fx_decrypt
.align	32
aes_fx_decrypt:
	and		$inp, 7, $tmp		! is input aligned?
143 144
	andn		$inp, 7, $inp
	ldd		[$key +  0], %f6	! round[0]
145
	ldd		[$key +  8], %f8
146 147 148 149 150
	mov		%o7, %g1
	ld		[$key + 240], $rounds

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7
151

152
	sll		$tmp, 3, $tmp
153 154 155 156
	ldd		[$inp + 0], %f0		! load input
	brz,pt		$tmp, .Ldec_inp_aligned
	ldd		[$inp + 8], %f2

157
	ldd		[%o7 + $tmp], %f14	! shift left params
158
	ldd		[$inp + 16], %f4
159 160
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2
161 162

.Ldec_inp_aligned:
163
	ldd		[$key + 16], %f10	! round[1]
164 165 166 167
	ldd		[$key + 24], %f12

	fxor		%f0, %f6, %f0		! ^=round[0]
	fxor		%f2, %f8, %f2
168 169 170
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $key
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
	sub		$rounds, 4, $rounds

.Loop_dec:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10
	ldd		[$key + 24], %f12
	add		$key, 32, $key

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$key +  0], %f6
	ldd		[$key +  8], %f8

	brnz,a		$rounds, .Loop_dec
	sub		$rounds, 2, $rounds

	andcc		$out, 7, $tmp		! is output aligned?
191
	andn		$out, 7, $out
192
	mov		0xff, $mask
193 194 195
	srl		$mask, $tmp, $mask
	add		%o7, 64, %o7
	sll		$tmp, 3, $tmp
196 197 198 199

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
200 201
	ldd		[%o7 + $tmp], %f14	! shift right params

202 203 204 205
	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

206 207
	bnz,pn		%icc, .Ldec_out_unaligned
	mov		%g1, %o7
208 209 210 211 212

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

213
.align	16
214
.Ldec_out_unaligned:
215 216 217 218 219
	add		$out, 16, $inp
	orn		%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8
220 221 222

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
223
	stda		%f8, [$inp + $tmp]0xc0	! partial store
224 225
	retl
	nop
226
.type	aes_fx_decrypt,#function
227 228 229 230 231 232 233 234 235 236 237 238 239
.size	aes_fx_decrypt,.-aes_fx_decrypt
___
}
{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
$code.=<<___;
.globl	aes_fx_set_decrypt_key
.align	32
aes_fx_set_decrypt_key:
	b		.Lset_encrypt_key
	mov		-1, $inc
	retl
	nop
240
.type	aes_fx_set_decrypt_key,#function
241 242 243 244 245 246
.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl	aes_fx_set_encrypt_key
.align	32
aes_fx_set_encrypt_key:
	mov		1, $inc
247
	nop
248 249
.Lset_encrypt_key:
	and		$inp, 7, $tmp
250
	andn		$inp, 7, $inp
251 252 253 254 255 256 257 258
	sll		$tmp, 3, $tmp
	mov		%o7, %g1

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

	ldd		[%o7 + $tmp], %f10	! shift left params
	mov		%g1, %o7
259 260 261 262 263 264 265 266 267 268 269 270

	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc, .L128
	ldd		[$inp + 8], %f2

	be,pt		%icc, .L192
	ldd		[$inp + 16], %f4
	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
271 272 273 274
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4
	fshiftorx	%f6, %f8, %f10, %f6
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302

.L256aligned:
	mov		14, $bits
	and		$inc, `14*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 0]
	faeskeyx	%f2, 0x01, %f4
	std		%f6, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f4, 0x00, %f6
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
303 304
	std		%f4,[$out + 0]
	std		%f6,[$out + 8]
305
	add		$out, $inc, $out
306 307
	std		%f0,[$out + 0]
	std		%f2,[$out + 8]
308 309 310 311 312 313 314 315 316
	retl
	xor		%o0, %o0, %o0		! return 0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
317 318 319
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360

.L192aligned:
	mov		12, $bits
	and		$inc, `12*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f4, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 0]
	faeskeyx	%f2, 0x00, %f4
	std		%f0, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f4, `0x10+$i+1`, %f0
	std		%f2, [$out + 0]
	faeskeyx	%f0, 0x00, %f2
	std		%f4, [$out + 8]
	add		$out, $inc, $out
___
$code.=<<___		if ($i<6);
	faeskeyx	%f2, 0x00, %f4
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	retl
	xor		%o0, %o0, %o0		! return 0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
361 362
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384

.L128aligned:
	mov		10, $bits
	and		$inc, `10*16`, $tmp
	st		$bits, [$out + 240]	! store rounds
	add		$out, $tmp, $out	! start or end of key schedule
	sllx		$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std		%f0, [$out + 0]
	faeskeyx	%f2, `0x10+$i`, %f0
	std		%f2, [$out + 8]
	add		$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
___
}
$code.=<<___;
	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	retl
	xor		%o0, %o0, %o0		! return 0
385
.type	aes_fx_set_encrypt_key,#function
386 387 388
.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}
389 390 391
{
# Symbolic names for the registers used by the aes_fx_cbc_encrypt template.
# The first list maps, in order, to %i0-%i5 and the second to %l0-%l6.
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Floating-point names are handed out from the even-numbered registers
# only, in order: $iv0 is %f16 and $fshift, the last one, is %f36.
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft/$iright are second names for the registers already known as
# $ialign/$oalign; they do not allocate new registers.
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
.globl	aes_fx_cbc_encrypt
.align	32
aes_fx_cbc_encrypt:
	save		%sp, -STACK_FRAME-16, %sp
401
	srln		$len, 4, $len
402 403
	and		$inp, 7, $ialign
	andn		$inp, 7, $inp
404 405 406 407 408 409
	brz,pn		$len, .Lcbc_no_data
	sll		$ialign, 3, $ileft

1:	call		.+8
	add		%o7, .Linp_align-1b, %o7

410 411 412
	ld		[$key + 240], $rounds
	and		$out, 7, $oalign
	ld		[$ivp + 0], %f0		! load ivec
413
	andn		$out, 7, $out
414
	ld		[$ivp + 4], %f1
415
	sll		$oalign, 3, $mask
416 417 418 419 420 421 422 423 424
	ld		[$ivp + 8], %f2
	ld		[$ivp + 12], %f3

	sll		$rounds, 4, $rounds
	add		$rounds, $key, $end
	ldd		[$key + 0], $r0hi	! round[0]
	ldd		[$key + 8], $r0lo

	add		$inp, 16, $inp
425
	sub		$len,  1, $len
426 427 428 429 430 431 432 433
	ldd		[$end + 0], $rlhi	! round[last]
	ldd		[$end + 8], $rllo

	mov		16, $inc
	movrz		$len, 0, $inc
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

434 435
	ldd		[%o7 + $ileft], $fshift	! shift left params
	add		%o7, 64, %o7
436 437 438 439 440 441 442 443
	ldd		[$inp - 16], $in0	! load input
	ldd		[$inp -  8], $in1
	ldda		[$inp]0x82, $intail	! non-faulting load
	brz		$dir, .Lcbc_decrypt
	add		$inp, $inc, $inp	! inp+=16

	fxor		$r0hi, %f0, %f0		! ivec^=round[0]
	fxor		$r0lo, %f2, %f2
444 445 446
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	nop
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484

.Loop_cbc_enc:
	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
	fxor		$in1, %f2, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lcbc_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_enc
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

485 486 487 488 489 490 491
	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

492 493 494 495 496 497
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

498 499
	fxor		$r0hi, $in0, $in0	! inp^=round[0]
	fxor		$r0lo, $in1, $in1
500 501

	fmovd		%f0, %f4
502
	faesenclx	%f2, $rlhi, %f0
503 504 505
	faesenclx	%f4, $rllo, %f2

	brnz,pn		$oalign, .Lcbc_enc_unaligned_out
506
	nop
507

508 509
	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
510 511 512
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_enc
513
	sub		$len, 1, $len
514

515 516 517 518
	st		%f0, [$ivp + 0]		! output ivec
	st		%f1, [$ivp + 4]
	st		%f2, [$ivp + 8]
	st		%f3, [$ivp + 12]
519 520 521 522 523 524 525

.Lcbc_no_data:
	ret
	restore

.align	32
.Lcbc_enc_unaligned_out:
526
	ldd		[%o7 + $mask], $fshift	! shift right params
527 528 529 530
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

531 532
	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
533 534

	stda		%f6, [$out + $mask]0xc0	! partial store
535
	orn		%g0, $mask, $mask
536 537 538
	std		%f8, [$out + 8]
	add		$out, 16, $out
	brz		$len, .Lcbc_enc_unaligned_out_done
539 540 541
	sub		$len, 1, $len
	b		.Loop_cbc_enc_unaligned_out
	nop
542

543
.align	32
544
.Loop_cbc_enc_unaligned_out:
545
	fmovd		%f2, $outhead
546 547 548 549 550 551 552 553 554 555 556 557 558
	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
	fxor		$in1, %f2, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp -  8], %o1
559
	brz		$ileft, .Lcbc_enc_aligned_inp
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lcbc_enc_aligned_inp:
	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
582
	nop
583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609

.Lcbc_enc_unaligned:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_enc_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
610

611 612 613 614 615 616 617 618 619
	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

620 621 622
	fxor		$r0hi, $in0, $in0	! inp^=round[0]
	fxor		$r0lo, $in1, $in1

623
	fmovd		%f0, %f4
624
	faesenclx	%f2, $rlhi, %f0
625 626
	faesenclx	%f4, $rllo, %f2

627 628
	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
629 630 631 632 633
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_enc_unaligned_out
634
	sub		$len, 1, $len
635 636

.Lcbc_enc_unaligned_out_done:
637
	fshiftorx	%f2, %f2, $fshift, %f8
638 639
	stda		%f8, [$out + $mask]0xc0	! partial store

640 641 642 643
	st		%f0, [$ivp + 0]		! output ivec
	st		%f1, [$ivp + 4]
	st		%f2, [$ivp + 8]
	st		%f3, [$ivp + 12]
644 645 646 647 648 649

	ret
	restore

.align	32
.Lcbc_decrypt:
650 651
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705
	fmovd		%f0, $iv0
	fmovd		%f2, $iv1

.Loop_cbc_dec:
	fxor		$in0, $r0hi, %f0	! inp^round[0]
	fxor		$in1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lcbc_dec:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_dec
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
	fxor		$iv1, $rllo, %f8
	fmovd		$in0, $iv0
	fmovd		$in1, $iv1

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

706 707
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
708 709 710 711 712 713 714 715 716 717 718 719 720

	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	brnz,pn		$oalign, .Lcbc_dec_unaligned_out
	nop

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_dec
721
	sub		$len, 1, $len
722 723 724 725 726 727 728 729 730 731 732

	st		$iv0,    [$ivp + 0]	! output ivec
	st		$iv0#lo, [$ivp + 4]
	st		$iv1,    [$ivp + 8]
	st		$iv1#lo, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_dec_unaligned_out:
733
	ldd		[%o7 + $mask], $fshift	! shift right params
734 735 736 737
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

738 739
	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
740

741 742 743
	stda		%f6, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	std		%f8, [$out + 8]
744 745
	add		$out, 16, $out
	brz		$len, .Lcbc_dec_unaligned_out_done
746 747 748
	sub		$len, 1, $len
	b		.Loop_cbc_dec_unaligned_out
	nop
749

750
.align	32
751 752 753 754 755 756 757 758 759 760 761 762 763 764 765
.Loop_cbc_dec_unaligned_out:
	fmovd		%f2, $outhead
	fxor		$in0, $r0hi, %f0	! inp^round[0]
	fxor		$in1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp - 8], %o1
766
	brz		$ileft, .Lcbc_dec_aligned_inp
767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lcbc_dec_aligned_inp:
	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
789
	nop
790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816

.Lcbc_dec_unaligned:
	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lcbc_dec_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
817

818 819 820 821
	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
	fxor		$iv1, $rllo, %f8
	fmovd		$in0, $iv0
	fmovd		$in1, $iv1
822 823
	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1
824 825 826 827 828 829 830 831 832 833 834

	fmovd		%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fmovd		%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

835 836 837 838
	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
839 840 841
	add		$out, 16, $out

	brnz,a		$len, .Loop_cbc_dec_unaligned_out
842
	sub		$len, 1, $len
843 844

.Lcbc_dec_unaligned_out_done:
845
	fshiftorx	%f2, %f2, $fshift, %f8
846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861
	stda		%f8, [$out + $mask]0xc0	! partial store

	st		$iv0,    [$ivp + 0]	! output ivec
	st		$iv0#lo, [$ivp + 4]
	st		$iv1,    [$ivp + 8]
	st		$iv1#lo, [$ivp + 12]

	ret
	restore
.type	aes_fx_cbc_encrypt,#function
.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}
{
# Symbolic names for the registers used by the aes_fx_ctr32_encrypt_blocks
# template.  The first list maps, in order, to %i0-%i4 (the sixth generated
# name, %i5, is unused) and the second to %l0-%l6.
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
# Floating-point names are handed out from the even-numbered registers
# only, in order: $ctr0 is %f16 and $fshift, the last one, is %f36.
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
# $ileft/$iright are second names for the registers already known as
# $ialign/$oalign; they do not allocate new registers.
my ($ileft,$iright) = ($ialign, $oalign);
# Register that the template loads the counter increment into.
my $one = "%f14";

$code.=<<___;
.globl	aes_fx_ctr32_encrypt_blocks
.align	32
aes_fx_ctr32_encrypt_blocks:
	save		%sp, -STACK_FRAME-16, %sp
872 873
	srln		$len, 0, $len
	and		$inp, 7, $ialign
874
	andn		$inp, 7, $inp
875 876
	brz,pn		$len, .Lctr32_no_data
	sll		$ialign, 3, $ileft
877 878

.Lpic:	call		.+8
879
	add		%o7, .Linp_align - .Lpic, %o7
880 881 882 883

	ld		[$key + 240], $rounds
	and		$out, 7, $oalign
	ld		[$ivp +  0], $ctr0	! load counter
884
	andn		$out, 7, $out
885
	ld		[$ivp +  4], $ctr0#lo
886
	sll		$oalign, 3, $mask
887 888
	ld		[$ivp +  8], $ctr1
	ld		[$ivp + 12], $ctr1#lo
889
	ldd		[%o7 + 128], $one
890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905

	sll		$rounds, 4, $rounds
	add		$rounds, $key, $end
	ldd		[$key + 0], $r0hi	! round[0]
	ldd		[$key + 8], $r0lo

	add		$inp, 16, $inp
	sub		$len, 1, $len
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	mov		16, $inc
	movrz		$len, 0, $inc
	ldd		[$end + 0], $rlhi	! round[last]
	ldd		[$end + 8], $rllo

906 907
	ldd		[%o7 + $ileft], $fshift	! shiftleft params
	add		%o7, 64, %o7
908 909 910 911 912
	ldd		[$inp - 16], $in0	! load input
	ldd		[$inp -  8], $in1
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

913 914
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964

.Loop_ctr32:
	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
	fxor		$ctr1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8
	add		$key, 32, $end
	sub		$rounds, 16*6, $inner

.Lctr32_enc:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lctr32_enc
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor		$in0, $rlhi, %f6	! inp^round[last]
	fxor		$in1, $rllo, %f8

	movrz		$len, 0, $inc
	fmovd		$intail, $in0
	ldd		[$inp - 8], $in1	! load next input block
	ldda		[$inp]0x82, $intail	! non-faulting load
	add		$inp, $inc, $inp	! inp+=16

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

965 966
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988
	fpadd32		$ctr1, $one, $ctr1	! increment counter

	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	brnz,pn		$oalign, .Lctr32_unaligned_out
	nop

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	add		$out, 16, $out

	brnz,a		$len, .Loop_ctr32
	sub		$len, 1, $len

.Lctr32_no_data:
	ret
	restore

.align	32
.Lctr32_unaligned_out:
989
	ldd		[%o7 + $mask], $fshift	! shift right params
990 991 992 993
	mov		0xff, $mask
	srl		$mask, $oalign, $mask
	sub		%g0, $ileft, $iright

994 995
	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
996

997 998 999
	stda		%f6, [$out + $mask]0xc0	! partial store
	orn		%g0, $mask, $mask
	std		%f8, [$out + 8]
1000 1001
	add		$out, 16, $out
	brz		$len, .Lctr32_unaligned_out_done
1002 1003 1004
	sub		$len, 1, $len
	b		.Loop_ctr32_unaligned_out
	nop
1005

1006
.align	32
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
.Loop_ctr32_unaligned_out:
	fmovd		%f2, $outhead
	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
	fxor		$ctr1, $r0lo, %f2
	ldd		[$key + 32], %f6	! round[2]
	ldd		[$key + 40], %f8

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 48], %f10	! round[3]
	ldd		[$key + 56], %f12

	ldx		[$inp - 16], %o0
	ldx		[$inp -  8], %o1
1022
	brz		$ileft, .Lctr32_aligned_inp
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
	movrz		$len, 0, $inc

	ldx		[$inp], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1

.Lctr32_aligned_inp:
	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$key + 64], %f6	! round[4]
	ldd		[$key + 72], %f8
	add		$key, 64, $end
	sub		$rounds, 16*8, $inner

	stx		%o0, [%sp + LOCALS + 0]
	stx		%o1, [%sp + LOCALS + 8]
	add		$inp, $inc, $inp	! inp+=16
1045
	nop
1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088

.Lctr32_enc_unaligned:
	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10
	ldd		[$end + 24], %f12
	add		$end, 32, $end

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd		[$end + 0], %f6
	ldd		[$end + 8], %f8

	brnz,a		$inner, .Lctr32_enc_unaligned
	sub		$inner, 16*2, $inner

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$end + 16], %f10	! round[last-1]
	ldd		[$end + 24], %f12
	fpadd32		$ctr1, $one, $ctr1	! increment counter

	fmovd		%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor		$in0, $rlhi, %f6	! inp^round[last]
	fxor		$in1, $rllo, %f8
	ldd		[%sp + LOCALS + 0], $in0
	ldd		[%sp + LOCALS + 8], $in1

	fmovd		%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd		[$key + 16], %f10	! round[1]
	ldd		[$key + 24], %f12

	fmovd		%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

1089 1090 1091 1092
	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std		%f6, [$out + 0]
	std		%f8, [$out + 8]
1093 1094 1095 1096 1097 1098
	add		$out, 16, $out

	brnz,a		$len, .Loop_ctr32_unaligned_out
	sub		$len, 1, $len

.Lctr32_unaligned_out_done:
1099
	fshiftorx	%f2, %f2, $fshift, %f8
1100 1101 1102 1103 1104 1105
	stda		%f8, [$out + $mask]0xc0	! partial store

	ret
	restore
.type	aes_fx_ctr32_encrypt_blocks,#function
.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1106

1107
.align	32
1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
.Linp_align:		! fshiftorx parameters for left shift toward %rs1
	.byte	0, 0, 64,  0,	0, 64,  0, -64
	.byte	0, 0, 56,  8,	0, 56,  8, -56
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0,  8, 56,	0,  8, 56, -8
.Lout_align:		! fshiftorx parameters for right shift toward %rs2
	.byte	0, 0,  0, 64,	0,  0, 64,   0
	.byte	0, 0,  8, 56,	0,  8, 56,  -8
	.byte	0, 0, 16, 48,	0, 16, 48, -16
	.byte	0, 0, 24, 40,	0, 24, 40, -24
	.byte	0, 0, 32, 32,	0, 32, 32, -32
	.byte	0, 0, 40, 24,	0, 40, 24, -40
	.byte	0, 0, 48, 16,	0, 48, 16, -48
	.byte	0, 0, 56,  8,	0, 56,  8, -56
1126 1127 1128 1129 1130 1131
.Lone:
	.word	0, 1
.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}
1132 1133 1134 1135 1136 1137 1138 1139 1140 1141
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary
# and letting the programmer detect at run-time whether the current CPU
# is VIS capable.
#
# unvis(MNEMONIC, RS1, RS2, RD)
#
# Turn a three-operand instruction on floating-point registers into a
# ".word" directive holding its machine encoding, followed by the original
# text as an assembler comment.
#
# The instruction is returned as plain "MNEMONIC\tRS1,RS2,RD" text, for the
# assembler to deal with, when
#   - MNEMONIC is not one of the opcodes listed in %visopf,
#   - an operand does not look like %fN, or
#   - an operand names an odd-numbered register with N >= 32.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$visopf{$mnemonic})) {
	# $_ aliases each operand in turn, so assigning to it replaces the
	# register name with its 5-bit field value.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	# Field layout: rd at bit 25, rs1 at bit 14, opf at bit 5, rs2 at
	# bit 0, on top of the fixed 0x81b00000 pattern.
	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unvis3(MNEMONIC, RS1, RS2, RD)
#
# Same idea as the floating-point encoders, but for the instructions in
# %visopf whose three operands are integer registers (%g, %o, %l, %i).
# Returns a ".word" directive with the original text as a trailing
# comment, or the plain instruction text when the mnemonic is not listed
# or an operand is not an integer register name.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my %visopf = (
	"alignaddr"	=> 0x018,
	"bmask"		=> 0x019,
	"alignaddrl"	=> 0x01a,
    );
    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # Register number = base of the register window group + digit.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my @regno;
    for my $operand ($rs1, $rs2, $rd) {
	$operand =~ /%([goli])([0-9])/ or return $ref;
	push @regno, $bias{$1} + $2;
    }
    my ($src1, $src2, $dst) = @regno;

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000 | $dst<<25 | $src1<<14 | $opf<<5 | $src2,
		   $ref;
}

# unfx(MNEMONIC, RS1, RS2, RD)
#
# Encode one of the AES instructions listed in %aesopf as a ".word"
# directive, keeping the original text as a trailing comment.  RS1 and RD
# must be %fN registers; RS2 is either an even-numbered %fN register or a
# number (a leading 0 makes it go through oct(), so 0x.. literals work).
# Anything that cannot be encoded is returned as plain instruction text.
sub unfx {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my %aesopf = (
	"faesencx"	=> 0x90,
	"faesdecx"	=> 0x91,
	"faesenclx"	=> 0x92,
	"faesdeclx"	=> 0x93,
	"faeskeyx"	=> 0x94,
    );
    my $opf = $aesopf{$mnemonic};
    return $ref unless defined $opf;

    # Second source: register name -> 5-bit field, otherwise keep the
    # immediate and convert 0-prefixed literals to a number.
    if ($rs2 =~ /%f([0-6]*[02468])/) {
	$rs2 = ($1 | $1>>5) & 31;
    }
    $rs2 = oct($rs2) if $rs2 =~ /^0/;

    my @field;
    for my $reg ($rs1, $rd) {
	return $ref unless $reg =~ /%f([0-9]{1,2})/;
	my $num = $1;
	if ($num >= 32) {
	    return $ref if $num & 1;
	    # re-encode for upper double register addressing
	    $num = ($num | $num>>5) & 31;
	}
	push @field, $num;
    }
    my ($src1, $dst) = @field;

    return sprintf ".word\t0x%08x !%s",
		   2<<30 | $dst<<25 | 0x36<<19 | $src1<<14 | $opf<<5 | $rs2,
		   $ref;
}

1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
# unfx3src(MNEMONIC, RS1, RS2, RS3, RD)
#
# Encode a four-operand (three sources, one destination) instruction from
# %aesopf -- currently only fshiftorx -- as a ".word" directive with the
# original text as a trailing comment.  All operands must be %fN
# registers; registers numbered 32 and up must be even.  Anything else is
# returned as plain instruction text.
sub unfx3src {
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my %aesopf = ( "fshiftorx" => 0x0b );
    my $opf = $aesopf{$mnemonic};
    return $ref unless defined $opf;

    my @field;
    for my $reg ($rs1, $rs2, $rs3, $rd) {
	return $ref unless $reg =~ /%f([0-9]{1,2})/;
	my $num = $1;
	if ($num >= 32) {
	    return $ref if $num & 1;
	    # re-encode for upper double register addressing
	    $num = ($num | $num>>5) & 31;
	}
	push @field, $num;
    }
    my ($src1, $src2, $src3, $dst) = @field;

    return sprintf ".word\t0x%08x !%s",
		   2<<30 | $dst<<25 | 0x37<<19 | $src1<<14 | $src3<<9
			 | $opf<<5 | $src2,
		   $ref;
}

1250 1251 1252
# Post-process the accumulated template line by line and print it:
#   1. replace every `...` span with the value of the Perl expression
#      inside it;
#   2. rewrite %fN#lo as the next register, %f(N+1);
#   3. hand the line to the first encoder whose operand pattern matches
#      (unfx, then unfx3src, then unvis, then unvis3); a line that matches
#      none of them is printed as it stands.
foreach my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    $line =~ s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		unfx($1,$2,$3,$4)
	     /ge or
    $line =~ s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		unfx3src($1,$2,$3,$4,$5)
	     /ge or
    $line =~ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		unvis($1,$2,$3,$4)
	     /ge or
    $line =~ s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	     /ge;
    print $line,"\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed, so a failed close must stop the build.
close STDOUT or die "error closing STDOUT: $!";