aesni-intel_asm.S 68.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
33
#include <asm/inst.h>
34
#include <asm/frame.h>
35
#include <asm/nospec-branch.h>
36

37 38 39 40 41 42 43 44 45 46 47
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (the same applies to the unaligned variants, movups vs. movdqu).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

48
#ifdef __x86_64__
49

50 51
# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52 53 54
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
55 56
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
57
POLY:   .octa 0xC2000000000000000000000000000001
58 59
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
60 61
TWOONE: .octa 0x00000001000000000000000000000001

62 63
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
64
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
65 66
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
67
MASK1:      .octa 0x0000000000000000ffffffffffffffff
68 69
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
70
MASK2:      .octa 0xffffffffffffffff0000000000000000
71 72
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
73
ONE:        .octa 0x00000000000000000000000000000001
74 75
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
76
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 78
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
79
dec:        .octa 0x1
80 81
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
82 83
enc:        .octa 0x2

84 85 86 87 88 89 90 91 92
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

93 94
.text

95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114

#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

115 116 117 118 119 120 121 122
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

123 124 125 126 127 128 129 130 131 132
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
133
#define arg11 STACK_OFFSET+40(%r14)
134
#define keysize 2*15*16(%arg1)
135
#endif
136 137


138 139 140 141 142 143 144 145 146 147 148 149
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3
150

151 152 153
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12
154

155 156
#define GF128MUL_MASK %xmm10

157 158
#ifdef __x86_64__
#define AREG	%rax
159 160
#define KEYP	%rdi
#define OUTP	%rsi
161
#define UKEYP	OUTP
162 163 164 165 166 167 168
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
169
#define TCTR_LOW T2
170 171 172 173 174 175 176 177 178 179 180 181
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
182

183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
.endm


.macro FUNC_RESTORE
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
.endm
203

D
Dave Watson 已提交
204 205 206 207

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222

	mov arg9, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11, %r11
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov %arg6, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

223
	mov	arg7, %r12
D
Dave Watson 已提交
224
	movdqu	(%r12), %xmm13
225
	movdqa	SHUF_MASK(%rip), %xmm2
D
Dave Watson 已提交
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
	PSHUFB_XMM %xmm2, %xmm13

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
245 246 247

	CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
	%xmm5 %xmm6
D
Dave Watson 已提交
248 249
.endm

250 251 252 253 254
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
255 256 257 258 259 260
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%rsp), %xmm13
	add %arg5, InLen(%arg2)
	mov %arg5, %r13		# save the number of bytes
	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
305 306 307
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

308 309
	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
310 311
	je	_multiple_of_16_bytes_\@

312 313
	mov %r13, PBlockLen(%arg2)

314 315
	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
316
	movdqu %xmm0, CurCount(%arg2)
317
	movdqa SHUF_MASK(%rip), %xmm10
318 319 320
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
321
	movdqu %xmm0, PBlockEncKey(%arg2)
322

323
	lea (%arg4,%r11,1), %r10
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
.ifc \operation, dec
	movdqa  %xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

349
	movdqu %xmm8, AadHash(%arg2)
350 351 352 353 354 355 356 357 358 359 360
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
361
	mov %rax, (%arg3 , %r11, 1)
362 363 364 365 366
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
367
	mov %al,  (%arg3, %r11, 1)
368 369 370 371 372 373 374
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

375 376 377 378
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE
379 380
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%rsp), %xmm13
381 382 383 384 385 386 387 388 389

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
390
	mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
391 392
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
393 394 395 396
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	MOVQ_R64_XMM    %r12, %xmm1

397 398 399 400 401 402 403 404
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

405
	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
406 407 408
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
409 410
	mov	arg10, %r10                     # %r10 = authTag
	mov	arg11, %r11                    # %r11 = auth_tag_len
411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

448
#ifdef __x86_64__
449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed right shift <<31
	pslld     $30, \TMP3            # packed right shift <<30
	pslld     $25, \TMP4            # packed right shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed left shift >>1
	psrld     $2,\TMP3              # packed left shift >>2
	psrld     $7,\TMP4              # packed left shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in TMP1
.endm

509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        MOVQ_R64_XMM %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
        por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

540 541 542 543 544 545 546 547 548
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg8, %r10		# %r10 = AAD
	mov	   arg9, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6
549 550

	cmp	   $16, %r11
551 552
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
553 554 555 556
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
557 558 559
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
560
	jge	   _get_AAD_blocks\@
561

562
	movdqu	   \TMP6, \TMP7
563 564

	/* read the last <16B of AAD */
565
_get_AAD_rest\@:
566
	cmp	   $0, %r11
567
	je	   _get_AAD_done\@
568

569 570 571 572 573
	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6
574

575
_get_AAD_done\@:
576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
	movdqu \TMP6, AadHash(%arg2)
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
593
	MOVADQ		SHUF_MASK(%rip), %xmm14
594 595 596

	movdqu AadHash(%arg2), %xmm\i		    # XMM0 = Y0

597
	xor	   %r11, %r11 # initialise the data pointer offset as zero
598
	# start AES for num_initial_blocks blocks
599

600
	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
601 602 603

.if (\i == 5) || (\i == 6) || (\i == 7)

604 605
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
606
.irpc index, \i_seq
607
	paddd		\TMP1, \XMM0                 # INCR Y0
608 609 610
.ifc \operation, dec
        movdqa     \XMM0, %xmm\index
.else
611
	MOVADQ		\XMM0, %xmm\index
612
.endif
613 614
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index
615
.endr
616 617 618 619 620
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

621
aes_loop_initial_\@:
622 623 624
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
625
.endr
626 627
	add	$16,%r10
	sub	$1,%eax
628
	jnz	aes_loop_initial_\@
629 630

	MOVADQ	(%r10), \TMP1
631
.irpc index, \i_seq
632
	AESENCLAST \TMP1, %xmm\index         # Last Round
633 634
.endr
.irpc index, \i_seq
635
	movdqu	   (%arg4 , %r11, 1), \TMP1
636
	pxor	   \TMP1, %xmm\index
637
	movdqu	   %xmm\index, (%arg3 , %r11, 1)
638 639
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
640 641 642 643

.ifc \operation, dec
	movdqa     \TMP1, %xmm\index
.endif
644 645 646 647 648
	PSHUFB_XMM	   %xmm14, %xmm\index

		# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
649

650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
669
	jl	_initial_blocks_done\@
670 671 672 673 674 675
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
676 677 678
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
679 680
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

681 682
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
683 684
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

685 686
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
687 688
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

689 690
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
691 692
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

693 694 695 696 697
	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
735 736 737 738
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
739
	jz	   aes_loop_pre_done\@
740

741
aes_loop_pre_\@:
742 743 744 745 746 747
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
748
	jnz	   aes_loop_pre_\@
749

750
aes_loop_pre_done\@:
751
	MOVADQ	   (%r10), \TMP2
752 753 754 755
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
756
	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
757
	pxor	   \TMP1, \XMM1
758
.ifc \operation, dec
759
	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
760 761
	movdqa     \TMP1, \XMM1
.endif
762
	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
763
	pxor	   \TMP1, \XMM2
764
.ifc \operation, dec
765
	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
766 767
	movdqa     \TMP1, \XMM2
.endif
768
	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
769
	pxor	   \TMP1, \XMM3
770
.ifc \operation, dec
771
	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
772 773
	movdqa     \TMP1, \XMM3
.endif
774
	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
775
	pxor	   \TMP1, \XMM4
776
.ifc \operation, dec
777
	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
778 779
	movdqa     \TMP1, \XMM4
.else
780 781 782 783
	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
784
.endif
785

786
	add	   $64, %r11
787
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
788 789
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
790 791 792 793
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

794
_initial_blocks_done\@:
795

796 797 798 799 800
.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
801
* arg1, %arg3, %arg4 are used as pointers only, not modified
802 803
* %r11 is the data offset value
*/
804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
939 940 941 942 943 944
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
945
	movdqu	  (%arg4,%r11,1), \TMP3
946
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
947
	movdqu	  16(%arg4,%r11,1), \TMP3
948
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
949
	movdqu	  32(%arg4,%r11,1), \TMP3
950
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
951
	movdqu	  48(%arg4,%r11,1), \TMP3
952
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
953 954 955 956
        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed right shift << 31
	pslld     $30, \TMP3                   # packed right shift << 30
	pslld     $25, \TMP4                   # packed right shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed left shift >>1
	psrld     $2, \TMP3                    # packed left shift >>2
	psrld     $7, \TMP4                    # packed left shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in TMP1

	pxor	  \XMM5, \XMM1
.endm

/*
* GHASH_4_ENCRYPT_4_PARALLEL_DEC
*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* %arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*
* On entry:
*   XMM0        counter block; it is incremented four times with ONE
*   XMM1..XMM4  the four blocks to be hashed (copied to XMM5..XMM8 and
*               multiplied by HashKey_4, HashKey_3, HashKey_2, HashKey)
* Memory:
*   reads 64 bytes at (%arg4,%r11) and writes 64 bytes at (%arg3,%r11);
*   round keys are read from (%arg1), hash keys from the stack frame.
* On exit:
*   XMM1..XMM4  the four input blocks just read, byte swapped, with the
*               reduced GHASH value of the previous four XORed into XMM1
* Clobbers: TMP1..TMP6, XMM5..XMM8, %xmm15, %r10, %eax, flags.
*
* The AES rounds are interleaved with the carry-less multiplications, so
* the instruction order below is deliberate.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply XMM5 by HashKey_4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1		# Round 0 (whitening key)
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

        # Multiply XMM7 by HashKey_2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10		# r10 = address of round key 10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@	# no extra rounds for 128-bit keys

	/*
	 * Extra middle rounds for 192/256-bit keys. The labels carry the
	 * macro expansion counter so the macro can be expanded more than
	 * once per object file, and the rounds address the blocks through
	 * the macro arguments like every other round above.
	 */
aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
	AESENC	  \TMP3, \XMM1
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1                 # keep the input block for GHASH
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >> 1
	psrld     $2, \TMP3                    # packed right shift >> 2
	psrld     $7, \TMP4                    # packed right shift >> 7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # reduced result is in XMM5

	pxor	  \XMM5, \XMM1                 # fold the hash into the next block
.endm

/*
 * GHASH the last 4 ciphertext blocks.
 *
 * XMM1..XMM4 are multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
 * respectively (all read from the stack frame), using Karatsuba with the
 * matching HashKey_n_k values for the middle term. The high halves are
 * accumulated in TMP6, the low halves in XMMDst and the middle terms in
 * XMM1, then the 256-bit sum TMP6:XMMDst is reduced in two phases.
 *
 * Result: XMMDst.
 * Clobbers: TMP1..TMP7 and XMM1..XMM4.
 */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shift << 31
	pslld     $30, \TMP3                # packed left shift << 30
	pslld     $25, \TMP4                # packed left shift << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm


/*
* Encryption of a single block
* uses eax & r10
*
* XMM0 is the block, encrypted in place with the key schedule at (%arg1);
* TMP1 is a scratch register for the round keys. The number of middle
* rounds is derived from keysize, the last round uses AESENCLAST.
* The loop label carries the macro expansion counter, so the macro can
* be expanded any number of times.
*/
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0		# round 0 (whitening key)
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1		# r10 now points at the last round key
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                      // Context data
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
/*
 * The body is built entirely from macros defined elsewhere in this file;
 * the only difference from aesni_gcm_enc is the "dec" argument passed to
 * GCM_ENC_DEC.
 */
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT
	GCM_ENC_DEC dec
	GCM_COMPLETE
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
/*
 * The body is built entirely from macros defined elsewhere in this file;
 * the only difference from aesni_gcm_dec is the "enc" argument passed to
 * GCM_ENC_DEC.
 */
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT
	GCM_ENC_DEC enc
	GCM_COMPLETE
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)

#endif

/*
 * _key_expansion_128 / _key_expansion_256a:	internal ABI
 * input:
 *	%xmm0:		previous round key
 *	%xmm1:		AESKEYGENASSIST result; its dword 3 is used
 *	%xmm4:		dword 0 must be zero (both shufps keep it unchanged)
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm0:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0		# each dword of xmm0 = XOR of itself and all lower dwords
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

/*
 * _key_expansion_192a:	internal ABI
 * input:
 *	%xmm0:		first four dwords of the previous key material
 *	%xmm1:		AESKEYGENASSIST result; its dword 1 is used
 *	%xmm2:		remaining key material (low two dwords are used)
 *	%xmm4:		dword 0 must be zero (both shufps keep it unchanged)
 *	TKEYP:		where the round keys are stored
 * output:
 *	%xmm0, %xmm2:	updated key material
 *	(TKEYP):	low qword of the old %xmm2 followed by the low qword
 *			of the new %xmm0
 *	0x10(TKEYP):	high qword of the new %xmm0 followed by the low
 *			qword of the new %xmm2
 *	TKEYP:		advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6		# keep the old xmm2 for the first store
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)

/*
 * _key_expansion_192b:	internal ABI
 * input:
 *	%xmm0:		first four dwords of the previous key material
 *	%xmm1:		AESKEYGENASSIST result; its dword 1 is used
 *	%xmm2:		remaining key material
 *	%xmm4:		dword 0 must be zero (both shufps keep it unchanged)
 *	TKEYP:		where the round key is stored
 * output:
 *	%xmm0:		updated key material, also written to (TKEYP)
 *	%xmm2:		updated key material (not stored here)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)

/*
 * _key_expansion_256b:	internal ABI
 * input:
 *	%xmm2:		previous odd round key
 *	%xmm1:		AESKEYGENASSIST result; its dword 2 is used
 *	%xmm4:		dword 0 must be zero (both shufps keep it unchanged)
 *	TKEYP:		where the new round key is stored
 * output:
 *	%xmm2:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands in_key into ctx:
 *   - the encryption round keys are written from offset 0 of ctx,
 *   - the decryption round keys are written from offset 240 of ctx, in
 *     reverse order, with AESIMC applied to all but the first and last,
 *   - key_len is stored at offset 480 of ctx.
 * key_len is compared against 24: below selects the 128-bit schedule,
 * equal the 192-bit schedule, above the 256-bit schedule.
 * AREG is cleared before returning (always-zero return value).
 * On 32-bit builds the arguments are fetched from the stack and KEYP is
 * saved and restored.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# first enc key becomes last dec key
	movaps %xmm1, 240(KEYP)		# last enc key becomes first dec key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec schedule downwards
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the 16-byte block at src into dst with the key schedule in
 * ctx. Both buffers are accessed with unaligned moves. The key length
 * is read from offset 480 of ctx and the rounds are run by _aesni_enc1.
 * On 32-bit builds the arguments are fetched from the stack and KEYP
 * and KLEN are saved and restored.
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742

/*
 * _aesni_enc1:		internal ABI
 *	Encrypt the one block held in STATE.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * KLEN is compared with 24: a smaller value jumps straight to .Lenc128,
 * an equal value enters at .Lenc192, and a larger value falls through
 * every round below.  TKEYP is advanced so that all three paths address
 * the shared tail with the same displacements.
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for je
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_enc1)
1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806

/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round key is loaded once and applied to all four states.  The
 * entry point into the round sequence is selected from KLEN exactly as
 * in _aesni_enc1 (below 24, equal to 24, above 24).
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for je
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
1897 1898 1899 1900 1901

/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src into dst with _aesni_dec1.
 * KEYP is advanced by 240 bytes before the call so that _aesni_dec1 reads
 * its round keys from the second key area of ctx (presumably the
 * decryption schedule written by aesni_set_key -- confirm against the
 * struct definition).
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	/* the two pushes above plus the return address precede the arguments */
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length (read before KEYP moves)
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934

/*
 * _aesni_dec1:		internal ABI
 *	Decrypt the one block held in STATE.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * KLEN is compared with 24: a smaller value jumps straight to .Ldec128,
 * an equal value enters at .Ldec192, and a larger value falls through
 * every round below.
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for je
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_dec1)
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998

/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round key is loaded once and applied to all four states.  The
 * entry point into the round sequence is selected from KLEN exactly as
 * in _aesni_dec1 (below 24, equal to 24, above 24).
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for je
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
2089 2090 2091 2092 2093 2094

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt len bytes from src to dst.  Data is processed four blocks
 * (64 bytes) at a time while at least 64 bytes remain, then one block
 * (16 bytes) at a time.  A trailing remainder shorter than 16 bytes is
 * left untouched.
 *
 * len is an unsigned quantity, so every length comparison uses the
 * unsigned condition codes (jb/jae).
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* the three pushes above plus the return address precede the arguments */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	/*
	 * NOTE(review): _aesni_enc1 works on STATE; this relies on STATE1
	 * naming the same register (definitions are outside this section).
	 */
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
2149 2150 2151 2152 2153 2154

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt len bytes from src to dst.  Data is processed four blocks
 * (64 bytes) at a time while at least 64 bytes remain, then one block
 * (16 bytes) at a time.  A trailing remainder shorter than 16 bytes is
 * left untouched.
 *
 * len is an unsigned quantity, so every length comparison uses the
 * unsigned condition codes (jb/jae).
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* the three pushes above plus the return address precede the arguments */
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length (read before KEYP moves)
	add $240, KEYP			# round keys used by _aesni_dec1/_aesni_dec4
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	/*
	 * NOTE(review): _aesni_dec1 works on STATE; this relies on STATE1
	 * naming the same register (definitions are outside this section).
	 */
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
2210 2211 2212 2213 2214 2215

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes from src to dst, one 16-byte block at a time.
 * The chaining value is loaded from iv and, when at least one block was
 * processed, the last ciphertext block is written back to iv.  With
 * len < 16 nothing is read or written.  A trailing remainder shorter
 * than 16 bytes is left untouched.
 *
 * len is an unsigned quantity, so every length comparison uses the
 * unsigned condition codes (jb/jae).
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* the four pushes above plus the return address precede the arguments */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext xor previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop
	movups STATE, (IVP)	# last ciphertext block becomes the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
2254 2255 2256 2257 2258 2259

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes from src to dst.  Data is processed four blocks
 * (64 bytes) at a time while at least 64 bytes remain, then one block
 * (16 bytes) at a time.  When at least one block was processed, the last
 * ciphertext block consumed is written back to iv.  With len < 16 nothing
 * is read or written.  A trailing remainder shorter than 16 bytes is left
 * untouched.
 *
 * len is an unsigned quantity, so every length comparison uses the
 * unsigned condition codes (jb/jae).
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	/* the four pushes above plus the return address precede the arguments */
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length (read before KEYP moves)
	add $240, KEYP			# round keys used by _aesni_dec1/_aesni_dec4
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	/* keep each ciphertext block: it is the chaining value of the next one */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * The 32-bit path only uses IN1 and IN2, so they are reused here for
	 * blocks three and four; blocks one and two are re-read from INP
	 * after the decryption.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4		# IN1 holds ciphertext block three here
	movaps IN2, IV			# IN2 holds ciphertext block four here
	/* re-read happens before any store to OUTP below */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains into the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
2347

2348
#ifdef __x86_64__
2349
.pushsection .rodata
.align 16
/*
 * Byte-shuffle control listing the source byte indices 15 down to 0, i.e.
 * a full 16-byte reversal.  Loaded into BSWAP_MASK by _aesni_inc_init.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * The mask is addressed RIP-relative so the code does not depend on the
 * absolute link address of .Lbswap_mask.
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the copy of IV
	mov $1, TCTR_LOW		# staged in TCTR_LOW only to build INC
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
ENDPROC(_aesni_inc_init)
2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq adds per 64-bit lane and does not carry between lanes, so the
 * low qword is mirrored in TCTR_LOW where the integer add exposes the
 * carry flag.  On carry, INC is temporarily shifted into the upper lane
 * to increment the high qword, then shifted back.
 */
.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC			# move the 1 into the upper qword
	paddq INC, CTR
	psrldq $8, INC			# restore INC == 1
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
ENDPROC(_aesni_inc)
2405 2406 2407 2408 2409 2410

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode processing of len bytes from src to dst.  Each counter value
 * is encrypted and XORed with the corresponding input block; the counter
 * is advanced with _aesni_inc after every block.  Data is processed four
 * blocks (64 bytes) at a time while at least 64 bytes remain, then one
 * block (16 bytes) at a time.  When at least one block was processed, the
 * next counter value is written back to iv.  With len < 16 nothing is
 * read or written.  A trailing remainder shorter than 16 bytes is left
 * untouched.
 *
 * len is an unsigned quantity, so every length comparison uses the
 * unsigned condition codes (jb/jae).
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* capture four consecutive counter values, loading input in between */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491

/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * Step by step:
 *	pshufd $0x13 copies dword 3 of IV into dword 0 of CTR and dword 1 of
 *	IV into dword 2 of CTR (dwords 1 and 3 of CTR receive dword 0 of IV).
 *	paddq IV, IV doubles each 64-bit half on its own, i.e. shifts each
 *	half left by one bit and drops the bits that were at positions 63
 *	and 127.
 *	psrad $31 turns every dword of CTR into all ones when its top bit
 *	was set and into zero otherwise, so dword 0 now reflects old bit 127
 *	and dword 2 reflects old bit 63.
 *	pand/pxor fold those two bits back into IV through GF128MUL_MASK
 *	(presumably 0x87 in the low half and 0x01 in the high half; the mask
 *	is defined outside this section -- confirm there).
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * XTS-process eight consecutive 16-byte blocks (offsets 0x00..0x70) from
 * src to dst, as two batches of four.  enc selects _aesni_enc4 (non-zero)
 * or _aesni_dec4 with KEYP advanced by 240 bytes (zero).
 *
 * For every block the current tweak in IV is XORed into the input and
 * also parked in the matching dst slot; after the four-block cipher call
 * it is read back from dst, XORed into the result, and overwritten by
 * the final output.  The tweak is advanced with _aesni_gf128mul_x_ble()
 * between blocks, and the tweak following the eighth block is stored to
 * iv.
 *
 * INC is used purely as a scratch XMM register here.  Code and data
 * symbols are addressed RIP-relative so the routine does not depend on
 * absolute link addresses.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl
	movl $0, %ecx			# mov, not xor: cmpb flags feed the cmovs
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx		# enc == 0: key offset 240
	cmoveq %rax, %r11		# enc == 0: call _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length (read before KEYP moves)
	addq %rcx, KEYP

	/* first batch: blocks 0..3 */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 0 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)		# park tweak 1 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)		# park tweak 2 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)		# park tweak 3 in dst

	CALL_NOSPEC %r11

	/* finish blocks 0..3 while preparing blocks 4..7 in the same registers */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)		# park tweak 4 in dst

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)		# park tweak 5 in dst

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)		# park tweak 6 in dst

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)		# park tweak 7 in dst

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)		# tweak for the block after these eight

	CALL_NOSPEC %r11

	/* finish blocks 4..7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)

2597
#endif