aesni-intel_asm.S 81.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
12 13 14 15 16 17 18 19 20 21 22
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
23 24 25
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
26 27 28 29 30 31 32
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
33
#include <asm/inst.h>
34
#include <asm/frame.h>
35

36 37 38 39 40 41 42 43 44 45 46
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can done for either FP or integer values, for FP use
 * movaps (move aligned packed single) or integer use movdqa (move double quad
 * aligned).  It doesn't make a performance difference which instruction is used
 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
 * shorter, so that is the one we'll use for now. (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

47
#ifdef __x86_64__
48

49 50
# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
51 52 53
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
54 55
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
56
POLY:   .octa 0xC2000000000000000000000000000001
57 58
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
59 60
TWOONE: .octa 0x00000001000000000000000000000001

61 62
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
63
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
64 65
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
66
MASK1:      .octa 0x0000000000000000ffffffffffffffff
67 68
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
69
MASK2:      .octa 0xffffffffffffffff0000000000000000
70 71
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
72
ONE:        .octa 0x00000000000000000000000000000001
73 74
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
75
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
76 77
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
78
dec:        .octa 0x1
79 80
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
81 82
enc:        .octa 0x2

83 84 85 86 87 88 89 90 91
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100

115

116 117
.text

118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147

#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
148
#define keysize 2*15*16(%arg1)
149
#endif
150 151


152 153 154 155 156 157 158 159 160 161 162 163
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3
164

165 166 167
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12
168

169 170
#define GF128MUL_MASK %xmm10

171 172
#ifdef __x86_64__
#define AREG	%rax
173 174
#define KEYP	%rdi
#define OUTP	%rsi
175
#define UKEYP	OUTP
176 177 178 179 180 181 182
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
183
#define TCTR_LOW T2
184 185 186 187 188 189 190 191 192 193 194 195
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
196

197

198
#ifdef __x86_64__
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed right shift <<31
	pslld     $30, \TMP3            # packed right shift <<30
	pslld     $25, \TMP4            # packed right shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed left shift >>1
	psrld     $2,\TMP3              # packed left shift >>2
	psrld     $7,\TMP4              # packed left shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in TMP1
.endm

259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        MOVQ_R64_XMM %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
        por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

290 291 292 293 294 295 296 297 298 299 300 301
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


302 303
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
304
        MOVADQ     SHUF_MASK(%rip), %xmm14
305 306 307 308
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
309
	pxor       \XMM2, \XMM2
310

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
341
	pxor	   \TMP1, %xmm\i
342 343 344 345 346 347
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
348
	add	   $4, %r10
349 350
	sub	   $4, %r10
	pslldq	   $12, \TMP1
351
	psrldq	   $4, %xmm\i
352 353 354 355 356 357 358 359 360 361
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
362
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
363 364
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
365

366
_get_AAD_done\num_initial_blocks\operation:
367
	xor	   %r11, %r11 # initialise the data pointer offset as zero
368
	# start AES for num_initial_blocks blocks
369 370 371

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
372 373 374
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
375 376
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		(%arg1),\TMP2
377
.irpc index, \i_seq
378
	paddd	   \TMP1, \XMM0                 # INCR Y0
379
	movdqa	   \XMM0, %xmm\index
380
	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
381
	pxor	   \TMP2, %xmm\index
382
.endr
383 384 385 386 387 388 389 390 391
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
392
.endr
393 394 395 396 397
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_dec\num_initial_blocks

	MOVADQ	(%r10), \TMP1
398
.irpc index, \i_seq
399
	AESENCLAST \TMP1, %xmm\index         # Last Round
400 401 402 403 404 405 406
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
407

408
	movdqa     \TMP1, %xmm\index
409
	PSHUFB_XMM	   %xmm14, %xmm\index
410
                # prepare plaintext/ciphertext for GHASH computation
411 412
.endr
.endif
413

414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
440 441 442
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
443 444
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

445 446
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
447 448
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

449 450
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
451 452
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

453 454
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
455 456
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

457 458 459 460 461
	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
563
        MOVADQ     SHUF_MASK(%rip), %xmm14
564 565 566 567
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
600
	pxor	   \TMP1, %xmm\i
601 602 603 604 605 606
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
607
	add	   $4, %r10
608 609
	sub	   $4, %r10
	pslldq	   $12, \TMP1
610
	psrldq	   $4, %xmm\i
611 612 613 614 615 616 617 618 619 620
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	   %r12, %r11
	salq	   $4, %r11
	movdqu	   aad_shift_arr(%r11), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
621
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
622 623
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
624

625
_get_AAD_done\num_initial_blocks\operation:
626
	xor	   %r11, %r11 # initialise the data pointer offset as zero
627
	# start AES for num_initial_blocks blocks
628 629 630 631 632 633 634

	mov	   %arg5, %rax                      # %rax = *Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

635 636
	MOVADQ		ONE(%RIP),\TMP1
	MOVADQ		0(%arg1),\TMP2
637
.irpc index, \i_seq
638 639 640 641
	paddd		\TMP1, \XMM0                 # INCR Y0
	MOVADQ		\XMM0, %xmm\index
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index
642
.endr
643 644 645 646 647 648 649 650 651
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
652
.endr
653 654 655 656 657
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_enc\num_initial_blocks

	MOVADQ	(%r10), \TMP1
658
.irpc index, \i_seq
659
	AESENCLAST \TMP1, %xmm\index         # Last Round
660 661 662 663 664 665 666 667 668 669 670 671
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM	   %xmm14, %xmm\index

		# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
672

673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
699 700 701
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
702 703
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

704 705
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
706 707
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

708 709
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
710 711
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

712 713
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
714 715
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

716 717 718 719 720
	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc	index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
775 776 777 778 779 780 781 782 783 784 785 786
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
787 788 789 790
	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
791

792
	add	   $64, %r11
793
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
794 795
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
796 797 798 799
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

800
_initial_blocks_done\num_initial_blocks\operation:
801

802 803 804 805 806 807 808 809
.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
	AESENCLAST \TMP3, \XMM1           # Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed right shift << 31
	pslld     $30, \TMP3                   # packed right shift << 30
	pslld     $25, \TMP4                   # packed right shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed left shift >>1
	psrld     $2, \TMP3                    # packed left shift >>2
	psrld     $7, \TMP4                    # packed left shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in TMP1

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1019 1020 1021 1022 1023 1024 1025
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

1026
        movdqa    SHUF_MASK(%rip), %xmm15
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
        # multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
1042
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1043
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1044 1045 1046 1047
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
1175 1176 1177 1178
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed right shift << 31
	pslld     $30, \TMP3                   # packed right shift << 30
	pslld     $25, \TMP4                   # packed right shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed left shift >>1
	psrld     $2, \TMP3                    # packed left shift >>2
	psrld     $7, \TMP4                    # packed left shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in TMP1

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed right shifting << 31
	pslld     $30, \TMP3                # packed right shifting << 30
	pslld     $25, \TMP4                # packed right shifting << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed left shift >> 1
	psrld     $2, \TMP3                 # packed left shift >> 2
	psrld     $7, \TMP4                 # packed left shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm


1322 1323 1324
/* Encryption of a single block
* uses eax & r10
*/
1325

1326
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1327

1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343
	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp                        # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1435 1436 1437
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468

# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
1469
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1470 1471 1472 1473
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
1474
	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1475 1476 1477 1478
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
1479
	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1480 1481 1482 1483
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
1484
	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1485 1486 1487 1488 1489 1490 1491
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
1492
	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13				# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

L
Lucas De Marchi 已提交
1505
        # Handle the last <16 byte block separately
1506 1507

	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1508 1509 1510
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

1511
	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1512

1513 1514 1515 1516 1517 1518
	lea (%arg3,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
1519 1520
	movdqa  %xmm1, %xmm2
	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1521
	movdqu (%r12), %xmm1
1522 1523 1524
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
	pand    %xmm1, %xmm2
1525 1526 1527
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

1528 1529 1530 1531
	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

        # output %r13 bytes
1532
	MOVQ_R64_XMM	%xmm0, %rax
1533 1534 1535 1536 1537
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
1538
	MOVQ_R64_XMM	%xmm0, %rax
1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al,  (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (*128)
1551
	MOVQ_R64_XMM	%arg4, %xmm1
1552 1553 1554 1555 1556
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	         # final GHASH computation
1557 1558 1559
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

1560 1561 1562 1563 1564 1565 1566 1567 1568
	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10                # %r10 = authTag
	mov	arg10, %r11               # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
1569 1570
	cmp	$8, %r11
	jl	_T_4_decrypt
1571
_T_8_decrypt:
1572
	MOVQ_R64_XMM	%xmm0, %rax
1573
	mov	%rax, (%r10)
1574 1575
	add	$8, %r10
	sub	$8, %r11
1576
	psrldq	$8, %xmm0
1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
1588
	movd	%xmm0, %eax
1589 1590 1591 1592 1593 1594 1595 1596 1597
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_decrypt:
	mov	%al, (%r10)
1598 1599 1600 1601 1602 1603 1604 1605 1606
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
1607
ENDPROC(aesni_gcm_dec)
1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
1698 1699 1700
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730

# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

        # reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
	and	$-16, %r13
	mov	%r13, %r12

        # Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
1731
	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1732 1733 1734 1735
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
1736
	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1737 1738 1739 1740
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
1741
	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1742 1743 1744 1745
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
1746
	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1747 1748 1749 1750 1751 1752 1753 1754 1755 1756
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
1757
	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

L
Lucas De Marchi 已提交
1770
         # Handle the last <16 Byte block separately
1771
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1772 1773 1774
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

1775
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1776 1777 1778 1779 1780 1781

	lea (%arg3,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea ALL_F+16(%rip), %r12
1782 1783
	sub %r13, %r12
	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1784
	movdqu	(%r12), %xmm1
1785 1786
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1787 1788
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0
1789 1790 1791 1792

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
1793 1794
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0
1795

1796 1797 1798
	# shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
1799
	MOVQ_R64_XMM %xmm0, %rax
1800 1801 1802 1803 1804
	cmp $8, %r13
	jle _less_than_8_bytes_left_encrypt
	mov %rax, (%arg2 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
1805
	MOVQ_R64_XMM %xmm0, %rax
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817
	sub $8, %r13
_less_than_8_bytes_left_encrypt:
	mov %al,  (%arg2, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12    # %r12 = addLen (number of bytes)
	shl	$3, %r12
	movd	%r12d, %xmm15       # len(A) in %xmm15
	shl	$3, %arg4               # len(C) in bits (*128)
1818
	MOVQ_R64_XMM	%arg4, %xmm1
1819 1820 1821 1822 1823
	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
1824 1825
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1826 1827 1828 1829 1830 1831 1832 1833 1834 1835

	mov	%arg5, %rax		       # %rax  = *Y0
	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10                     # %r10 = authTag
	mov	arg10, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
1836 1837
	cmp	$8, %r11
	jl	_T_4_encrypt
1838
_T_8_encrypt:
1839
	MOVQ_R64_XMM	%xmm0, %rax
1840
	mov	%rax, (%r10)
1841 1842
	add	$8, %r10
	sub	$8, %r11
1843
	psrldq	$8, %xmm0
1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_4_encrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_123_encrypt:
1855
	movd	%xmm0, %eax
1856 1857 1858 1859 1860 1861 1862 1863 1864
	cmp	$2, %r11
	jl	_T_1_encrypt
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_encrypt
	add	$2, %r10
	sar	$16, %eax
_T_1_encrypt:
	mov	%al, (%r10)
1865 1866 1867 1868 1869 1870 1871 1872 1873
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
1874
ENDPROC(aesni_gcm_enc)
1875

1876
#endif
1877 1878


1879
.align 4
1880 1881 1882 1883 1884 1885 1886 1887
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
1888 1889
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
1890
	ret
1891 1892
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
1893

1894
.align 4
1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
1912
	movaps %xmm6, (TKEYP)
1913
	shufps $0b01001110, %xmm2, %xmm1
1914 1915
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
1916
	ret
1917
ENDPROC(_key_expansion_192a)
1918

1919
.align 4
1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

1934 1935
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
1936
	ret
1937
ENDPROC(_key_expansion_192b)
1938

1939
.align 4
1940 1941 1942 1943 1944 1945 1946
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
1947 1948
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
1949
	ret
1950
ENDPROC(_key_expansion_256b)
1951 1952 1953 1954 1955 1956

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
1957
	FRAME_BEGIN
1958 1959
#ifndef __x86_64__
	pushl KEYP
1960 1961 1962
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1963 1964 1965 1966 1967
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
1968 1969 1970 1971
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
1972 1973 1974
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
1975
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1976
	call _key_expansion_256a
1977
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1978
	call _key_expansion_256b
1979
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1980
	call _key_expansion_256a
1981
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1982
	call _key_expansion_256b
1983
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1984
	call _key_expansion_256a
1985
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1986
	call _key_expansion_256b
1987
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1988
	call _key_expansion_256a
1989
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1990
	call _key_expansion_256b
1991
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1992
	call _key_expansion_256a
1993
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1994
	call _key_expansion_256b
1995
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1996
	call _key_expansion_256a
1997
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1998
	call _key_expansion_256b
1999
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
2000 2001 2002
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
2003
	movq 0x10(UKEYP), %xmm2		# other user key
2004
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
2005
	call _key_expansion_192a
2006
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
2007
	call _key_expansion_192b
2008
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
2009
	call _key_expansion_192a
2010
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
2011
	call _key_expansion_192b
2012
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
2013
	call _key_expansion_192a
2014
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
2015
	call _key_expansion_192b
2016
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
2017
	call _key_expansion_192a
2018
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
2019 2020 2021
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
2022
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
2023
	call _key_expansion_128
2024
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
2025
	call _key_expansion_128
2026
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
2027
	call _key_expansion_128
2028
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
2029
	call _key_expansion_128
2030
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
2031
	call _key_expansion_128
2032
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
2033
	call _key_expansion_128
2034
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
2035
	call _key_expansion_128
2036
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
2037
	call _key_expansion_128
2038
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
2039
	call _key_expansion_128
2040
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
2041 2042
	call _key_expansion_128
.Ldec_key:
2043 2044 2045 2046 2047 2048 2049
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
2050 2051
.align 4
.Ldec_key_loop:
2052
	movaps (KEYP), %xmm0
2053
	AESIMC %xmm0 %xmm1
2054 2055 2056 2057
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
2058
	jb .Ldec_key_loop
2059 2060 2061 2062
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
2063
	FRAME_END
2064
	ret
2065
ENDPROC(aesni_set_key)
2066 2067 2068 2069 2070

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
2071
	FRAME_BEGIN
2072 2073 2074
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
2075 2076 2077
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
2078
#endif
2079 2080 2081 2082
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
2083 2084 2085 2086
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
2087
	FRAME_END
2088
	ret
2089
ENDPROC(aesni_enc)
2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		round count
 *	STATE:		initial state (input)
 * output:
 *	STATE:		finial state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2103
.align 4
2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
2115
	AESENC KEY STATE
2116
	movaps -0x50(TKEYP), KEY
2117
	AESENC KEY STATE
2118 2119 2120
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
2121
	AESENC KEY STATE
2122
	movaps -0x30(TKEYP), KEY
2123
	AESENC KEY STATE
2124 2125 2126
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
2127
	AESENC KEY STATE
2128
	movaps -0x10(TKEYP), KEY
2129
	AESENC KEY STATE
2130
	movaps (TKEYP), KEY
2131
	AESENC KEY STATE
2132
	movaps 0x10(TKEYP), KEY
2133
	AESENC KEY STATE
2134
	movaps 0x20(TKEYP), KEY
2135
	AESENC KEY STATE
2136
	movaps 0x30(TKEYP), KEY
2137
	AESENC KEY STATE
2138
	movaps 0x40(TKEYP), KEY
2139
	AESENC KEY STATE
2140
	movaps 0x50(TKEYP), KEY
2141
	AESENC KEY STATE
2142
	movaps 0x60(TKEYP), KEY
2143
	AESENC KEY STATE
2144
	movaps 0x70(TKEYP), KEY
2145
	AESENCLAST KEY STATE
2146
	ret
2147
ENDPROC(_aesni_enc1)
2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		round count
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		finial state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2167
.align 4
2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
2182 2183 2184 2185
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2186
	movaps -0x50(TKEYP), KEY
2187 2188 2189 2190
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2191 2192 2193
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
2194 2195 2196 2197
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2198
	movaps -0x30(TKEYP), KEY
2199 2200 2201 2202
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2203 2204 2205
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
2206 2207 2208 2209
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2210
	movaps -0x10(TKEYP), KEY
2211 2212 2213 2214
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2215
	movaps (TKEYP), KEY
2216 2217 2218 2219
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2220
	movaps 0x10(TKEYP), KEY
2221 2222 2223 2224
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2225
	movaps 0x20(TKEYP), KEY
2226 2227 2228 2229
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2230
	movaps 0x30(TKEYP), KEY
2231 2232 2233 2234
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2235
	movaps 0x40(TKEYP), KEY
2236 2237 2238 2239
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2240
	movaps 0x50(TKEYP), KEY
2241 2242 2243 2244
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2245
	movaps 0x60(TKEYP), KEY
2246 2247 2248 2249
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
2250
	movaps 0x70(TKEYP), KEY
2251 2252 2253 2254
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
2255
	ret
2256
ENDPROC(_aesni_enc4)
2257 2258 2259 2260 2261

/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
2262
	FRAME_BEGIN
2263 2264 2265
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
2266 2267 2268
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
2269
#endif
2270 2271 2272 2273 2274
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
2275 2276 2277 2278
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
2279
	FRAME_END
2280
	ret
2281
ENDPROC(aesni_dec)
2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		finial state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2295
.align 4
2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
2307
	AESDEC KEY STATE
2308
	movaps -0x50(TKEYP), KEY
2309
	AESDEC KEY STATE
2310 2311 2312
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
2313
	AESDEC KEY STATE
2314
	movaps -0x30(TKEYP), KEY
2315
	AESDEC KEY STATE
2316 2317 2318
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
2319
	AESDEC KEY STATE
2320
	movaps -0x10(TKEYP), KEY
2321
	AESDEC KEY STATE
2322
	movaps (TKEYP), KEY
2323
	AESDEC KEY STATE
2324
	movaps 0x10(TKEYP), KEY
2325
	AESDEC KEY STATE
2326
	movaps 0x20(TKEYP), KEY
2327
	AESDEC KEY STATE
2328
	movaps 0x30(TKEYP), KEY
2329
	AESDEC KEY STATE
2330
	movaps 0x40(TKEYP), KEY
2331
	AESDEC KEY STATE
2332
	movaps 0x50(TKEYP), KEY
2333
	AESDEC KEY STATE
2334
	movaps 0x60(TKEYP), KEY
2335
	AESDEC KEY STATE
2336
	movaps 0x70(TKEYP), KEY
2337
	AESDECLAST KEY STATE
2338
	ret
2339
ENDPROC(_aesni_dec1)
2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		finial state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2359
.align 4
2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
2374 2375 2376 2377
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2378
	movaps -0x50(TKEYP), KEY
2379 2380 2381 2382
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2383 2384 2385
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
2386 2387 2388 2389
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2390
	movaps -0x30(TKEYP), KEY
2391 2392 2393 2394
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2395 2396 2397
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
2398 2399 2400 2401
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2402
	movaps -0x10(TKEYP), KEY
2403 2404 2405 2406
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2407
	movaps (TKEYP), KEY
2408 2409 2410 2411
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2412
	movaps 0x10(TKEYP), KEY
2413 2414 2415 2416
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2417
	movaps 0x20(TKEYP), KEY
2418 2419 2420 2421
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2422
	movaps 0x30(TKEYP), KEY
2423 2424 2425 2426
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2427
	movaps 0x40(TKEYP), KEY
2428 2429 2430 2431
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2432
	movaps 0x50(TKEYP), KEY
2433 2434 2435 2436
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2437
	movaps 0x60(TKEYP), KEY
2438 2439 2440 2441
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
2442
	movaps 0x70(TKEYP), KEY
2443 2444 2445 2446
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
2447
	ret
2448
ENDPROC(_aesni_dec4)
2449 2450 2451 2452 2453 2454

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
2455
	FRAME_BEGIN
2456 2457 2458 2459
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
2460 2461 2462 2463
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2464
#endif
2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
2501 2502 2503 2504 2505
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
2506
	FRAME_END
2507
	ret
2508
ENDPROC(aesni_ecb_enc)
2509 2510 2511 2512 2513 2514

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len);
 */
ENTRY(aesni_ecb_dec)
2515
	FRAME_BEGIN
2516 2517 2518 2519
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
2520 2521 2522 2523
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2524
#endif
2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
2562 2563 2564 2565 2566
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
2567
	FRAME_END
2568
	ret
2569
ENDPROC(aesni_ecb_dec)
2570 2571 2572 2573 2574 2575

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
2576
	FRAME_BEGIN
2577 2578 2579 2580 2581
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
2582 2583 2584 2585 2586
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2587
#endif
2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
2605 2606 2607 2608 2609 2610
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
2611
	FRAME_END
2612
	ret
2613
ENDPROC(aesni_cbc_enc)
2614 2615 2616 2617 2618 2619

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
2620
	FRAME_BEGIN
2621 2622 2623 2624 2625
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
2626 2627 2628 2629 2630
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2631
#endif
2632
	cmp $16, LEN
2633
	jb .Lcbc_dec_just_ret
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
2645
#ifdef __x86_64__
2646 2647 2648 2649
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
2650 2651 2652 2653 2654 2655
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
2656 2657
	call _aesni_dec4
	pxor IV, STATE1
2658
#ifdef __x86_64__
2659 2660 2661 2662
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
2663 2664 2665
#else
	pxor IN1, STATE4
	movaps IN2, IV
2666 2667 2668 2669
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
2670
#endif
2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
2696 2697
	movups IV, (IVP)
.Lcbc_dec_just_ret:
2698 2699 2700 2701 2702 2703
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
2704
	FRAME_END
2705
	ret
2706
ENDPROC(aesni_cbc_dec)
2707

2708
#ifdef __x86_64__
2709
.pushsection .rodata
2710 2711 2712
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2713
.popsection
2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
2726
.align 4
2727 2728 2729 2730 2731
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
2732 2733
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
2734
	ret
2735
ENDPROC(_aesni_inc_init)
2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
2752
.align 4
2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
2764
ENDPROC(_aesni_inc)
2765 2766 2767 2768 2769 2770

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
2771
	FRAME_BEGIN
2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
2825
	FRAME_END
2826
	ret
2827
ENDPROC(aesni_ctr_enc)
2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851

/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
2852
	FRAME_BEGIN
2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	movdqa IV, STATE1
2868 2869
	movdqu 0x00(INP), INC
	pxor INC, STATE1
2870 2871 2872 2873
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
2874 2875
	movdqu 0x10(INP), INC
	pxor INC, STATE2
2876 2877 2878 2879
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
2880 2881
	movdqu 0x20(INP), INC
	pxor INC, STATE3
2882 2883 2884 2885
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
2886 2887
	movdqu 0x30(INP), INC
	pxor INC, STATE4
2888 2889 2890 2891
	movdqu IV, 0x30(OUTP)

	call *%r11

2892 2893
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
2894 2895 2896 2897
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
2898 2899
	movdqu 0x40(INP), INC
	pxor INC, STATE1
2900 2901
	movdqu IV, 0x40(OUTP)

2902 2903
	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
2904 2905 2906 2907
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
2908 2909
	movdqu 0x50(INP), INC
	pxor INC, STATE2
2910 2911
	movdqu IV, 0x50(OUTP)

2912 2913
	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
2914 2915 2916 2917
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
2918 2919
	movdqu 0x60(INP), INC
	pxor INC, STATE3
2920 2921
	movdqu IV, 0x60(OUTP)

2922 2923
	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
2924 2925 2926 2927
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
2928 2929
	movdqu 0x70(INP), INC
	pxor INC, STATE4
2930 2931 2932 2933 2934 2935 2936
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	call *%r11

2937 2938
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
2939 2940
	movdqu STATE1, 0x40(OUTP)

2941 2942
	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
2943 2944
	movdqu STATE2, 0x50(OUTP)

2945 2946
	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
2947 2948
	movdqu STATE3, 0x60(OUTP)

2949 2950
	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
2951 2952
	movdqu STATE4, 0x70(OUTP)

2953
	FRAME_END
2954 2955 2956
	ret
ENDPROC(aesni_xts_crypt8)

2957
#endif