/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  Since Nehalem (the original Core i7) it makes no
 * performance difference which instruction is used.  However, movaps is one
 * byte shorter, so that is the one we'll use for now (same for the unaligned
 * variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
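
# Example (illustrative only): an aligned load of a 16-byte constant vs. an
# unaligned load of caller-supplied data:
#	MOVADQ	SHUF_MASK(%rip), %xmm14		# constant is 16-byte aligned
#	MOVUDQ	(%r10), %xmm1			# input buffer may be unaligned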

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000
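
# Keeping SHIFT_MASK, ALL_F and the zero block contiguous lets the code below
# derive two masks from a single offset load: e.g. a 16-byte load from
# ALL_F+16-r13 yields r13 bytes of 0xff followed by zeroes (keep only the low
# r13 bytes), while a load from SHIFT_MASK+r13 yields a PSHUFB index vector
# that shifts a register right by r13 bytes and zero-fills the rest.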

.text


#define	STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
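				// Karatsuba on the 64-bit halves: with
				// A = A1:A0 and B = B1:B0 (carry-less,
				// so "+" is XOR),
				//   A*B = A1*B1<<128 ^ A0*B0
				//       ^ ((A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0)<<64
				// Pre-storing A1^A0 for each HashKey power
				// avoids recomputing it for every data block.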

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# Clobbers r12 and the tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1
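	# If the bit shifted out of the MSB was set, reduce by xoring in POLY;
	# the pshufd/pcmpeqd/pand sequence below selects either POLY or zero
	# without a branch, based on that carry bit.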

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13, xmm0-xmm6 and xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11, %r11
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11, %r11 # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
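	# %r12 = (number of complete 16-byte blocks mod 4) * 16; used below to
	# peel off 0-3 blocks before the 4-blocks-at-a-time main loop.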
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge _large_enough_update_\@

	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1
	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE finishes the GHASH of any remaining partial block and computes
# the authentication tag.
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, xmm0, xmm1 and xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	MOVQ_R64_XMM    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
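
# The macros above compose into one complete GCM operation, e.g. (as done by
# aesni_gcm_dec further below):
#	FUNC_SAVE
#	GCM_INIT	iv, subkey, aad, aadlen    # hash AAD, derive HashKeys
#	GCM_ENC_DEC	enc (or dec)               # bulk crypt + GHASH
#	GCM_COMPLETE	authtag, authtaglen        # finalize and emit the tag
#	FUNC_RESTORE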

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed right shift <<31
	pslld     $30, \TMP3            # packed right shift <<30
	pslld     $25, \TMP4            # packed right shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed left shift >>1
	psrld     $2,\TMP3              # packed left shift >>2
	psrld     $7,\TMP4              # packed left shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm
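
# The shift counts used in the two reduction phases come from the field
# polynomial x^128 + x^127 + x^126 + x^121 + 1 handled in bit-reflected form:
# the exponent gaps 128-127=1, 128-126=2 and 128-121=7 give the psrld counts
# (1, 2, 7), and their 32-bit complements (31, 30, 25) give the pslld counts.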

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        MOVQ_R64_XMM %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
        por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
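
# For example, with DLEN = 11 the first 8 bytes are read with one 64-bit mov
# into the low quadword of XMMDst, and the remaining 3 bytes are accumulated
# into %rax highest-offset-first (offsets 10, 9, 8), preserving memory order,
# then placed into the low bytes of XMMDst's upper quadword - without ever
# reading past DPTR + DLEN.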

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov 	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the number of bytes already filled in the partial block)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM	%xmm10, %xmm3
	PSHUFB_XMM	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12
_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%rax,%rax

	mov	%rax, PBlockLen(%arg2)
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	PSHUFB_XMM	%xmm10, %xmm9
	PSHUFB_XMM	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	cmp	$0, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa	%xmm9, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ		SHUF_MASK(%rip), %xmm14

	movdqu AadHash(%arg2), %xmm\i		    # %xmm\i = AadHash

	# start AES for num_initial_blocks blocks

	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	paddd		\TMP1, \XMM0                 # INCR Y0
.ifc \operation, dec
        movdqa     \XMM0, %xmm\index
.else
	MOVADQ		\XMM0, %xmm\index
.endif
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13
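	# keysize is the AES key length in bytes (16/24/32), so %eax becomes
	# 9/11/13: the number of full AESENC rounds to run before the final
	# AESENCLAST (AES-128/192/256 use 10/12/14 rounds in total).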

aes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg4 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

.ifc \operation, dec
	movdqa     \TMP1, %xmm\index
.endif
	PSHUFB_XMM	   %xmm14, %xmm\index

		# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
.ifc \operation, dec
	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa     \TMP1, \XMM1
.endif
	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
.ifc \operation, dec
	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa     \TMP1, \XMM2
.endif
	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
.ifc \operation, dec
	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa     \TMP1, \XMM3
.endif
	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
.ifc \operation, dec
	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa     \TMP1, \XMM4
.else
	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu    HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed right shift << 31
	pslld     $30, \TMP3                   # packed right shift << 30
	pslld     $25, \TMP4                   # packed right shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed left shift >>1
	psrld     $2, \TMP3                    # packed left shift >>2
	psrld     $7, \TMP4                    # packed left shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu    HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed right shift << 31
	pslld     $30, \TMP3                   # packed right shift << 30
	pslld     $25, \TMP4                   # packed right shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed left shift >>1
	psrld     $2, \TMP3                    # packed left shift >>2
	psrld     $7, \TMP4                    # packed left shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed right shifting << 31
	pslld     $30, \TMP3                # packed right shifting << 30
	pslld     $25, \TMP4                # packed right shifting << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed left shift >> 1
	psrld     $2, \TMP3                 # packed left shift >> 2
	psrld     $7, \TMP4                 # packed left shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                      // Context data
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*/
ENTRY(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*/
ENTRY(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
 *                    const u8 *in,       // Ciphertext input
 *                    u64 plaintext_len,  // Length of data in bytes for decryption.
*/
ENTRY(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*/
ENTRY(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_finalize)
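
/*
 * For illustration only (a hedged C sketch, not part of this file): the
 * intended calling sequence for the three entry points above when a message
 * arrives as several contiguous segments.  gdata, j0, hsubkey, the segment
 * list and the loop below are hypothetical caller-side state; the prototypes
 * follow the comments above.
 *
 *	aesni_gcm_init(aes_ctx, &gdata, j0, hsubkey, aad, aad_len);
 *	for (each segment (dst, src, seg_len))
 *		aesni_gcm_enc_update(aes_ctx, &gdata, dst, src, seg_len);
 *	aesni_gcm_finalize(aes_ctx, &gdata, auth_tag, auth_tag_len);
 *
 * Decryption is the same with aesni_gcm_dec_update() in the loop, followed
 * by comparing the computed tag against the received one.
 */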

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
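
/*
 * Key schedule layout assumed by the fixed offsets above (240 and 480); a
 * sketch of struct crypto_aes_ctx as seen from this file, with field names
 * taken from the kernel's C side and shown here only for orientation:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// encryption round keys, offset 0
 *		u32 key_dec[60];	// decryption round keys, offset 240
 *		u32 key_length;		// 16, 24 or 32, offset 480
 *	};
 *
 * aesni_set_key() expands the user key into key_enc and then builds key_dec
 * in reverse order with AESIMC in the .Ldec_key loop above.
 */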

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
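
/*
 * Worked example of the TKEYP offsets used by _aesni_enc1 (and mirrored by
 * _aesni_dec1 below), derived from the code above: after the conditional
 * adds, TKEYP is KEYP+0x30 for AES-128, KEYP+0x50 for AES-192 and KEYP+0x70
 * for AES-256.  The shared tail at .Lenc128 therefore always finishes with
 * AESENCLAST on the key at 0x70(TKEYP), i.e. round-key offset 0xa0, 0xc0 or
 * 0xe0 respectively - the last of 10, 12 or 14 rounds.
 */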

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
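
/*
 * For illustration only: the chaining implemented by the CBC loops above,
 * written as a C-style sketch (E()/D() stand for one-block _aesni_enc1 and
 * _aesni_dec1; C[-1] is the caller-supplied IV):
 *
 *	C[i] = E(P[i] ^ C[i-1]);	// aesni_cbc_enc, .Lcbc_enc_loop
 *	P[i] = D(C[i]) ^ C[i-1];	// aesni_cbc_dec, .Lcbc_dec_loop1
 *
 * The 4-way decrypt loop keeps the previous ciphertext blocks around (in
 * IN1..IN4 on x86_64, reloaded from memory on 32-bit where registers are
 * scarce) so they can be xor-ed in after the parallel _aesni_dec4 call.
 */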

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
ENDPROC(_aesni_inc)
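
/*
 * For illustration only: what _aesni_inc computes, as a C sketch on the two
 * 64-bit halves of the little-endian counter kept in CTR (lo/hi are
 * hypothetical names for the low and high quadwords):
 *
 *	lo++;			// paddq INC, CTR on the low qword
 *	if (lo == 0)		// carry out of the low qword ...
 *		hi++;		// ... is propagated via the shifted INC
 *
 * The result is byte-swapped back to big endian and left in IV.
 */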

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
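
/*
 * For illustration only: the per-block relation computed by the CTR loops
 * above (E() stands for _aesni_enc1/_aesni_enc4 applied to the big-endian
 * counter blocks produced by _aesni_inc; counter 0 is the caller's IV):
 *
 *	C[i] = P[i] ^ E(counter + i);
 *
 * Decryption reuses the same routine, since xor-ing with the keystream is
 * its own inverse.
 */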

/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
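
/*
 * For illustration only: the same multiply-by-x in GF(2^128), written as a C
 * sketch on the two 64-bit halves of the little-endian XTS tweak (lo/hi are
 * hypothetical names):
 *
 *	u64 carry = hi >> 63;			// bit shifted out at the top
 *	hi = (hi << 1) | (lo >> 63);		// 128-bit left shift by one
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);	// reduce with the polynomial
 */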

/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC %r11

	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
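
/*
 * For illustration only: the per-block XTS relation computed above (T[i] is
 * the tweak, advanced with _aesni_gf128mul_x_ble between blocks; E() is
 * _aesni_enc4 or _aesni_dec4, selected through %r11):
 *
 *	C[i] = E(P[i] ^ T[i]) ^ T[i];
 *	T[i+1] = T[i] * x;		// in GF(2^128)
 *
 * The tweaks are parked in the output buffer before each AES pass and
 * xor-ed back in afterwards, which is why OUTP is re-read after every
 * CALL_NOSPEC %r11.
 */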

#endif