/* * Implement AES algorithm in Intel AES-NI instructions. * * The white paper of AES-NI instructions can be downloaded from: * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying * Vinodh Gopal * Kahraman Akdemir * * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD * interface for 64-bit kernels. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Adrian Hoban * James Guilford (james.guilford@intel.com) * Gabriele Paoloni * Tadeusz Struk (tadeusz.struk@intel.com) * Wajdi Feghali (wajdi.k.feghali@intel.com) * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include #include #include /* * The following macros are used to move an (un)aligned 16 byte value to/from * an XMM register. This can done for either FP or integer values, for FP use * movaps (move aligned packed single) or integer use movdqa (move double quad * aligned). It doesn't make a performance difference which instruction is used * since Nehalem (original Core i7) was released. However, the movaps is a byte * shorter, so that is the one we'll use for now. (same for unaligned). */ #define MOVADQ movaps #define MOVUDQ movups #ifdef __x86_64__ # constants in mergeable sections, linker can reorder and merge .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 .align 16 .Lgf128mul_x_ble_mask: .octa 0x00000000000000010000000000000087 .section .rodata.cst16.POLY, "aM", @progbits, 16 .align 16 POLY: .octa 0xC2000000000000000000000000000001 .section .rodata.cst16.TWOONE, "aM", @progbits, 16 .align 16 TWOONE: .octa 0x00000001000000000000000000000001 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 .align 16 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F .section .rodata.cst16.MASK1, "aM", @progbits, 16 .align 16 MASK1: .octa 0x0000000000000000ffffffffffffffff .section .rodata.cst16.MASK2, "aM", @progbits, 16 .align 16 MASK2: .octa 0xffffffffffffffff0000000000000000 .section .rodata.cst16.ONE, "aM", @progbits, 16 .align 16 ONE: .octa 0x00000000000000000000000000000001 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 .align 16 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 .section .rodata.cst16.dec, "aM", @progbits, 16 .align 16 dec: .octa 0x1 .section .rodata.cst16.enc, "aM", @progbits, 16 .align 16 enc: .octa 0x2 # order of these constants should not change. # more specifically, ALL_F should follow SHIFT_MASK, # and zero should follow ALL_F .section .rodata, "a", @progbits .align 16 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 .section .rodata .align 16 .type aad_shift_arr, @object .size aad_shift_arr, 272 aad_shift_arr: .octa 0xffffffffffffffffffffffffffffffff .octa 0xffffffffffffffffffffffffffffff0C .octa 0xffffffffffffffffffffffffffff0D0C .octa 0xffffffffffffffffffffffffff0E0D0C .octa 0xffffffffffffffffffffffff0F0E0D0C .octa 0xffffffffffffffffffffff0C0B0A0908 .octa 0xffffffffffffffffffff0D0C0B0A0908 .octa 0xffffffffffffffffff0E0D0C0B0A0908 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 .octa 0xffffffffffffff0C0B0A090807060504 .octa 0xffffffffffff0D0C0B0A090807060504 .octa 0xffffffffff0E0D0C0B0A090807060504 .octa 0xffffffff0F0E0D0C0B0A090807060504 .octa 0xffffff0C0B0A09080706050403020100 .octa 0xffff0D0C0B0A09080706050403020100 .octa 0xff0E0D0C0B0A09080706050403020100 .octa 0x0F0E0D0C0B0A09080706050403020100 .text #define STACK_OFFSET 8*3 #define HashKey 16*0 // store HashKey <<1 mod poly here #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 // bits of HashKey <<1 mod poly here //(for Karatsuba purposes) #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 // bits of HashKey^2 <<1 mod poly here // (for Karatsuba purposes) #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 // bits of HashKey^3 <<1 mod poly here // (for Karatsuba purposes) #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 // bits of HashKey^4 <<1 mod poly here // (for Karatsuba purposes) #define VARIABLE_OFFSET 16*8 #define arg1 rdi #define arg2 rsi #define arg3 rdx #define arg4 rcx #define arg5 r8 #define arg6 r9 #define arg7 STACK_OFFSET+8(%r14) #define arg8 STACK_OFFSET+16(%r14) #define arg9 STACK_OFFSET+24(%r14) #define arg10 STACK_OFFSET+32(%r14) #define keysize 2*15*16(%arg1) #endif #define STATE1 %xmm0 #define STATE2 %xmm4 #define STATE3 %xmm5 #define STATE4 %xmm6 #define STATE STATE1 #define IN1 %xmm1 #define IN2 %xmm7 #define IN3 %xmm8 #define IN4 %xmm9 #define IN IN1 #define KEY %xmm2 #define IV %xmm3 #define BSWAP_MASK %xmm10 #define CTR %xmm11 #define INC %xmm12 #define GF128MUL_MASK %xmm10 #ifdef __x86_64__ #define AREG %rax #define KEYP %rdi #define OUTP %rsi #define UKEYP OUTP #define INP %rdx #define LEN %rcx #define IVP %r8 #define KLEN %r9d #define T1 %r10 #define TKEYP T1 #define T2 %r11 #define TCTR_LOW T2 #else #define AREG %eax #define KEYP %edi #define OUTP AREG #define UKEYP OUTP #define INP %edx #define LEN %esi #define IVP %ebp #define KLEN %ebx #define T1 %ecx #define TKEYP T1 #endif #ifdef __x86_64__ /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) * * * Input: A and B (128-bits each, bit-reflected) * Output: C = A*B*x mod poly, (i.e. >>1 ) * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. * */ .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 movdqa \GH, \TMP1 pshufd $78, \GH, \TMP2 pshufd $78, \HK, \TMP3 pxor \GH, \TMP2 # TMP2 = a1+a0 pxor \HK, \TMP3 # TMP3 = b1+b0 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) pxor \GH, \TMP2 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) movdqa \TMP2, \TMP3 pslldq $8, \TMP3 # left shift TMP3 2 DWs psrldq $8, \TMP2 # right shift TMP2 2 DWs pxor \TMP3, \GH pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK # first phase of the reduction movdqa \GH, \TMP2 movdqa \GH, \TMP3 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 # in in order to perform # independent shifts pslld $31, \TMP2 # packed right shift <<31 pslld $30, \TMP3 # packed right shift <<30 pslld $25, \TMP4 # packed right shift <<25 pxor \TMP3, \TMP2 # xor the shifted versions pxor \TMP4, \TMP2 movdqa \TMP2, \TMP5 psrldq $4, \TMP5 # right shift TMP5 1 DW pslldq $12, \TMP2 # left shift TMP2 3 DWs pxor \TMP2, \GH # second phase of the reduction movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 # in in order to perform # independent shifts movdqa \GH,\TMP3 movdqa \GH,\TMP4 psrld $1,\TMP2 # packed left shift >>1 psrld $2,\TMP3 # packed left shift >>2 psrld $7,\TMP4 # packed left shift >>7 pxor \TMP3,\TMP2 # xor the shifted versions pxor \TMP4,\TMP2 pxor \TMP5, \TMP2 pxor \TMP2, \GH pxor \TMP1, \GH # result is in TMP1 .endm # Reads DLEN bytes starting at DPTR and stores in XMMDst # where 0 < DLEN < 16 # Clobbers %rax, DLEN and XMM1 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst cmp $8, \DLEN jl _read_lt8_\@ mov (\DPTR), %rax MOVQ_R64_XMM %rax, \XMMDst sub $8, \DLEN jz _done_read_partial_block_\@ xor %eax, %eax _read_next_byte_\@: shl $8, %rax mov 7(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_\@ MOVQ_R64_XMM %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst jmp _done_read_partial_block_\@ _read_lt8_\@: xor %eax, %eax _read_next_byte_lt8_\@: shl $8, %rax mov -1(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_lt8_\@ MOVQ_R64_XMM %rax, \XMMDst _done_read_partial_block_\@: .endm /* * if a = number of total plaintext bytes * b = floor(a/16) * num_initial_blocks = b mod 4 * encrypt the initial num_initial_blocks blocks and apply ghash on * the ciphertext * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers * are clobbered * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified */ .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i pxor \XMM2, \XMM2 cmp $16, %r11 jl _get_AAD_rest8\num_initial_blocks\operation _get_AAD_blocks\num_initial_blocks\operation: movdqu (%r10), %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor %xmm\i, \XMM2 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 add $16, %r10 sub $16, %r12 sub $16, %r11 cmp $16, %r11 jge _get_AAD_blocks\num_initial_blocks\operation movdqu \XMM2, %xmm\i cmp $0, %r11 je _get_AAD_done\num_initial_blocks\operation pxor %xmm\i,%xmm\i /* read the last <16B of AAD. since we have at least 4B of data right after the AAD (the ICV, and maybe some CT), we can read 4B/8B blocks safely, and then get rid of the extra stuff */ _get_AAD_rest8\num_initial_blocks\operation: cmp $4, %r11 jle _get_AAD_rest4\num_initial_blocks\operation movq (%r10), \TMP1 add $8, %r10 sub $8, %r11 pslldq $8, \TMP1 psrldq $8, %xmm\i pxor \TMP1, %xmm\i jmp _get_AAD_rest8\num_initial_blocks\operation _get_AAD_rest4\num_initial_blocks\operation: cmp $0, %r11 jle _get_AAD_rest0\num_initial_blocks\operation mov (%r10), %eax movq %rax, \TMP1 add $4, %r10 sub $4, %r10 pslldq $12, \TMP1 psrldq $4, %xmm\i pxor \TMP1, %xmm\i _get_AAD_rest0\num_initial_blocks\operation: /* finalize: shift out the extra bytes we read, and align left. since pslldq can only shift by an immediate, we use vpshufb and an array of shuffle masks */ movq %r12, %r11 salq $4, %r11 movdqu aad_shift_arr(%r11), \TMP1 PSHUFB_XMM \TMP1, %xmm\i _get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor \XMM2, %xmm\i GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 _get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) MOVADQ ONE(%RIP),\TMP1 MOVADQ (%arg1),\TMP2 .irpc index, \i_seq paddd \TMP1, \XMM0 # INCR Y0 movdqa \XMM0, %xmm\index PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr lea 0x10(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 aes_loop_initial_dec\num_initial_blocks: MOVADQ (%r10),\TMP1 .irpc index, \i_seq AESENC \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_initial_dec\num_initial_blocks MOVADQ (%r10), \TMP1 .irpc index, \i_seq AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 pxor \TMP1, %xmm\index movdqu %xmm\index, (%arg2 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 movdqa \TMP1, %xmm\index PSHUFB_XMM %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation .endr .endif # apply GHASH on num_initial_blocks blocks .if \i == 5 pxor %xmm5, %xmm6 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm6, %xmm7 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .elseif \i == 6 pxor %xmm6, %xmm7 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .elseif \i == 7 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .endif cmp $64, %r13 jl _initial_blocks_done\num_initial_blocks\operation # no need for precomputed values /* * * Precomputations for HashKey parallel with encryption of first 4 blocks. * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ MOVADQ ONE(%rip), \TMP1 paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap MOVADQ 0(%arg1),\TMP1 pxor \TMP1, \XMM1 pxor \TMP1, \XMM2 pxor \TMP1, \XMM3 pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 movdqa \TMP1, HashKey_k(%rsp) GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^2<<1 (mod poly) movdqa \TMP5, HashKey_2(%rsp) # HashKey_2 = HashKey^2<<1 (mod poly) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_2_k(%rsp) .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) movdqa \TMP5, HashKey_3(%rsp) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_3_k(%rsp) .irpc index, 56789 # do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) movdqa \TMP5, HashKey_4(%rsp) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 jz aes_loop_pre_dec_done\num_initial_blocks aes_loop_pre_dec\num_initial_blocks: MOVADQ (%r10),\TMP2 .irpc index, 1234 AESENC \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_pre_dec\num_initial_blocks aes_loop_pre_dec_done\num_initial_blocks: MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 AESENCLAST \TMP2, \XMM4 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) movdqa \TMP1, \XMM1 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM2 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) movdqa \TMP1, \XMM2 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM3 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) movdqa \TMP1, \XMM3 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM4 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) movdqa \TMP1, \XMM4 add $64, %r11 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: .endm /* * if a = number of total plaintext bytes * b = floor(a/16) * num_initial_blocks = b mod 4 * encrypt the initial num_initial_blocks blocks and apply ghash on * the ciphertext * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers * are clobbered * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified */ .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i pxor \XMM2, \XMM2 cmp $16, %r11 jl _get_AAD_rest8\num_initial_blocks\operation _get_AAD_blocks\num_initial_blocks\operation: movdqu (%r10), %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor %xmm\i, \XMM2 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 add $16, %r10 sub $16, %r12 sub $16, %r11 cmp $16, %r11 jge _get_AAD_blocks\num_initial_blocks\operation movdqu \XMM2, %xmm\i cmp $0, %r11 je _get_AAD_done\num_initial_blocks\operation pxor %xmm\i,%xmm\i /* read the last <16B of AAD. since we have at least 4B of data right after the AAD (the ICV, and maybe some PT), we can read 4B/8B blocks safely, and then get rid of the extra stuff */ _get_AAD_rest8\num_initial_blocks\operation: cmp $4, %r11 jle _get_AAD_rest4\num_initial_blocks\operation movq (%r10), \TMP1 add $8, %r10 sub $8, %r11 pslldq $8, \TMP1 psrldq $8, %xmm\i pxor \TMP1, %xmm\i jmp _get_AAD_rest8\num_initial_blocks\operation _get_AAD_rest4\num_initial_blocks\operation: cmp $0, %r11 jle _get_AAD_rest0\num_initial_blocks\operation mov (%r10), %eax movq %rax, \TMP1 add $4, %r10 sub $4, %r10 pslldq $12, \TMP1 psrldq $4, %xmm\i pxor \TMP1, %xmm\i _get_AAD_rest0\num_initial_blocks\operation: /* finalize: shift out the extra bytes we read, and align left. since pslldq can only shift by an immediate, we use vpshufb and an array of shuffle masks */ movq %r12, %r11 salq $4, %r11 movdqu aad_shift_arr(%r11), \TMP1 PSHUFB_XMM \TMP1, %xmm\i _get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor \XMM2, %xmm\i GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 _get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) MOVADQ ONE(%RIP),\TMP1 MOVADQ 0(%arg1),\TMP2 .irpc index, \i_seq paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, %xmm\index PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr lea 0x10(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 aes_loop_initial_enc\num_initial_blocks: MOVADQ (%r10),\TMP1 .irpc index, \i_seq AESENC \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_initial_enc\num_initial_blocks MOVADQ (%r10), \TMP1 .irpc index, \i_seq AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 pxor \TMP1, %xmm\index movdqu %xmm\index, (%arg2 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 PSHUFB_XMM %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation .endr .endif # apply GHASH on num_initial_blocks blocks .if \i == 5 pxor %xmm5, %xmm6 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm6, %xmm7 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .elseif \i == 6 pxor %xmm6, %xmm7 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .elseif \i == 7 pxor %xmm7, %xmm8 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .endif cmp $64, %r13 jl _initial_blocks_done\num_initial_blocks\operation # no need for precomputed values /* * * Precomputations for HashKey parallel with encryption of first 4 blocks. * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ MOVADQ ONE(%RIP),\TMP1 paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap MOVADQ 0(%arg1),\TMP1 pxor \TMP1, \XMM1 pxor \TMP1, \XMM2 pxor \TMP1, \XMM3 pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 movdqa \TMP1, HashKey_k(%rsp) GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^2<<1 (mod poly) movdqa \TMP5, HashKey_2(%rsp) # HashKey_2 = HashKey^2<<1 (mod poly) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_2_k(%rsp) .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) movdqa \TMP5, HashKey_3(%rsp) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_3_k(%rsp) .irpc index, 56789 # do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 # TMP5 = HashKey^3<<1 (mod poly) movdqa \TMP5, HashKey_4(%rsp) pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 jz aes_loop_pre_enc_done\num_initial_blocks aes_loop_pre_enc\num_initial_blocks: MOVADQ (%r10),\TMP2 .irpc index, 1234 AESENC \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_pre_enc\num_initial_blocks aes_loop_pre_enc_done\num_initial_blocks: MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 AESENCLAST \TMP2, \XMM4 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM2 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM3 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM4 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) movdqu \XMM2, 16*1(%arg2 , %r11 , 1) movdqu \XMM3, 16*2(%arg2 , %r11 , 1) movdqu \XMM4, 16*3(%arg2 , %r11 , 1) add $64, %r11 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: .endm /* * encrypt 4 blocks at a time * ghash the 4 previously encrypted ciphertext blocks * arg1, %arg2, %arg3 are used as pointers only, not modified * %r11 is the data offset value */ .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 movdqa \XMM2, \XMM6 movdqa \XMM3, \XMM7 movdqa \XMM4, \XMM8 movdqa SHUF_MASK(%rip), %xmm15 # multiply TMP5 * HashKey using karatsuba movdqa \XMM5, \TMP4 pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqa HashKey_4(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqa HashKey_4_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 2 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqa HashKey_3(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 4 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 movdqa HashKey_3_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 pxor \TMP2, \TMP6 movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 movdqa HashKey_2(%rsp ), \TMP5 # Multiply TMP5 * HashKey using karatsuba PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 6 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 7 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 movdqa HashKey_2_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 pxor \TMP2, \TMP6 # Multiply XMM8 * HashKey # XMM8 and TMP5 hold the values for the two operands movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqa HashKey(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 jz aes_loop_par_enc_done aes_loop_par_enc: MOVADQ (%r10),\TMP3 .irpc index, 1234 AESENC \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_par_enc aes_loop_par_enc_done: MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # Round 10 AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 movdqa HashKey_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu 16(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK movdqu 32(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK movdqu 48(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 pxor \TMP6, \TMP2 pxor \TMP1, \TMP2 pxor \XMM5, \TMP2 movdqa \TMP2, \TMP3 pslldq $8, \TMP3 # left shift TMP3 2 DWs psrldq $8, \TMP2 # right shift TMP2 2 DWs pxor \TMP3, \XMM5 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 # first phase of reduction movdqa \XMM5, \TMP2 movdqa \XMM5, \TMP3 movdqa \XMM5, \TMP4 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently pslld $31, \TMP2 # packed right shift << 31 pslld $30, \TMP3 # packed right shift << 30 pslld $25, \TMP4 # packed right shift << 25 pxor \TMP3, \TMP2 # xor the shifted versions pxor \TMP4, \TMP2 movdqa \TMP2, \TMP5 psrldq $4, \TMP5 # right shift T5 1 DW pslldq $12, \TMP2 # left shift T2 3 DWs pxor \TMP2, \XMM5 # second phase of reduction movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 movdqa \XMM5,\TMP3 movdqa \XMM5,\TMP4 psrld $1, \TMP2 # packed left shift >>1 psrld $2, \TMP3 # packed left shift >>2 psrld $7, \TMP4 # packed left shift >>7 pxor \TMP3,\TMP2 # xor the shifted versions pxor \TMP4,\TMP2 pxor \TMP5, \TMP2 pxor \TMP2, \XMM5 pxor \TMP1, \XMM5 # result is in TMP1 pxor \XMM5, \XMM1 .endm /* * decrypt 4 blocks at a time * ghash the 4 previously decrypted ciphertext blocks * arg1, %arg2, %arg3 are used as pointers only, not modified * %r11 is the data offset value */ .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 movdqa \XMM2, \XMM6 movdqa \XMM3, \XMM7 movdqa \XMM4, \XMM8 movdqa SHUF_MASK(%rip), %xmm15 # multiply TMP5 * HashKey using karatsuba movdqa \XMM5, \TMP4 pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqa HashKey_4(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqa HashKey_4_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 2 AESENC \TMP1, \XMM2 AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqa HashKey_3(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 4 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 movdqa HashKey_3_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 pxor \TMP2, \TMP6 movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 movdqa HashKey_2(%rsp ), \TMP5 # Multiply TMP5 * HashKey using karatsuba PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 6 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 7 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 movdqa HashKey_2_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 pxor \TMP2, \TMP6 # Multiply XMM8 * HashKey # XMM8 and TMP5 hold the values for the two operands movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqa HashKey(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 jz aes_loop_par_dec_done aes_loop_par_dec: MOVADQ (%r10),\TMP3 .irpc index, 1234 AESENC \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax jnz aes_loop_par_dec aes_loop_par_dec_done: MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # last round AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 movdqa HashKey_k(%rsp), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM1 movdqu 16(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM2 movdqu 32(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM3 movdqu 48(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 pxor \TMP6, \TMP2 pxor \TMP1, \TMP2 pxor \XMM5, \TMP2 movdqa \TMP2, \TMP3 pslldq $8, \TMP3 # left shift TMP3 2 DWs psrldq $8, \TMP2 # right shift TMP2 2 DWs pxor \TMP3, \XMM5 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 # first phase of reduction movdqa \XMM5, \TMP2 movdqa \XMM5, \TMP3 movdqa \XMM5, \TMP4 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently pslld $31, \TMP2 # packed right shift << 31 pslld $30, \TMP3 # packed right shift << 30 pslld $25, \TMP4 # packed right shift << 25 pxor \TMP3, \TMP2 # xor the shifted versions pxor \TMP4, \TMP2 movdqa \TMP2, \TMP5 psrldq $4, \TMP5 # right shift T5 1 DW pslldq $12, \TMP2 # left shift T2 3 DWs pxor \TMP2, \XMM5 # second phase of reduction movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 movdqa \XMM5,\TMP3 movdqa \XMM5,\TMP4 psrld $1, \TMP2 # packed left shift >>1 psrld $2, \TMP3 # packed left shift >>2 psrld $7, \TMP4 # packed left shift >>7 pxor \TMP3,\TMP2 # xor the shifted versions pxor \TMP4,\TMP2 pxor \TMP5, \TMP2 pxor \TMP2, \XMM5 pxor \TMP1, \XMM5 # result is in TMP1 pxor \XMM5, \XMM1 .endm /* GHASH the last 4 ciphertext blocks. */ .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst # Multiply TMP6 * HashKey (using Karatsuba) movdqa \XMM1, \TMP6 pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 movdqa HashKey_4(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 movdqa HashKey_4_k(%rsp), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 # Multiply TMP1 * HashKey (using Karatsuba) movdqa \XMM2, \TMP1 pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 movdqa HashKey_3(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 movdqa HashKey_3_k(%rsp), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 # Multiply TMP1 * HashKey (using Karatsuba) movdqa \XMM3, \TMP1 pshufd $78, \XMM3, \TMP2 pxor \XMM3, \TMP2 movdqa HashKey_2(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 movdqa HashKey_2_k(%rsp), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 # Multiply TMP1 * HashKey (using Karatsuba) movdqa \XMM4, \TMP1 pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 movdqa HashKey(%rsp), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 movdqa HashKey_k(%rsp), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst pxor \XMM1, \TMP2 pxor \TMP6, \TMP2 pxor \XMMDst, \TMP2 # middle section of the temp results combined as in karatsuba algorithm movdqa \TMP2, \TMP4 pslldq $8, \TMP4 # left shift TMP4 2 DWs psrldq $8, \TMP2 # right shift TMP2 2 DWs pxor \TMP4, \XMMDst pxor \TMP2, \TMP6 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications # first phase of the reduction movdqa \XMMDst, \TMP2 movdqa \XMMDst, \TMP3 movdqa \XMMDst, \TMP4 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently pslld $31, \TMP2 # packed right shifting << 31 pslld $30, \TMP3 # packed right shifting << 30 pslld $25, \TMP4 # packed right shifting << 25 pxor \TMP3, \TMP2 # xor the shifted versions pxor \TMP4, \TMP2 movdqa \TMP2, \TMP7 psrldq $4, \TMP7 # right shift TMP7 1 DW pslldq $12, \TMP2 # left shift TMP2 3 DWs pxor \TMP2, \XMMDst # second phase of the reduction movdqa \XMMDst, \TMP2 # make 3 copies of XMMDst for doing 3 shift operations movdqa \XMMDst, \TMP3 movdqa \XMMDst, \TMP4 psrld $1, \TMP2 # packed left shift >> 1 psrld $2, \TMP3 # packed left shift >> 2 psrld $7, \TMP4 # packed left shift >> 7 pxor \TMP3, \TMP2 # xor the shifted versions pxor \TMP4, \TMP2 pxor \TMP7, \TMP2 pxor \TMP2, \XMMDst pxor \TMP6, \XMMDst # reduced result is in XMMDst .endm /* Encryption of a single block * uses eax & r10 */ .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 pxor (%arg1), \XMM0 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 lea 16(%arg1), %r10 # get first expanded key address _esb_loop_\@: MOVADQ (%r10),\TMP1 AESENC \TMP1,\XMM0 add $16,%r10 sub $1,%eax jnz _esb_loop_\@ MOVADQ (%r10),\TMP1 AESENCLAST \TMP1,\XMM0 .endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. * u8 *out, // Plaintext output. Encrypt in-place is allowed. * const u8 *in, // Ciphertext input * u64 plaintext_len, // Length of data in bytes for decryption. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) * // concatenated with 0x00000001. 16-byte aligned pointer. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, // Additional Authentication Data (AAD) * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the * // given authentication tag and only return the plaintext if they match. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 * // (most likely), 12 or 8. * * Assumptions: * * keys: * keys are pre-expanded and aligned to 16 bytes. we are using the first * set of 11 keys in the data structure void *aes_ctx * * iv: * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Salt (From the SA) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initialization Vector | * | (This is the sequence number from IPSec header) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * * AAD: * AAD padded to 128 bits with 0 * for example, assume AAD is a u32 vector * * if AAD is 8 bytes: * AAD[3] = {A0, A1}; * padded AAD in xmm register = {A1 A0 0 0} * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SPI (A1) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 32-bit Sequence Number (A0) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x0 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * AAD Format with 32-bit Sequence Number * * if AAD is 12 bytes: * AAD[3] = {A0, A1, A2}; * padded AAD in xmm register = {A2 A1 A0 0} * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SPI (A2) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 64-bit Extended Sequence Number {A1,A0} | * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x0 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * AAD Format with 64-bit Extended Sequence Number * * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ ENTRY(aesni_gcm_dec) push %r12 push %r13 push %r14 mov %rsp, %r14 /* * states of %xmm registers %xmm6:%xmm15 not saved * all %xmm registers are clobbered */ sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes mov %arg6, %r12 movdqu (%r12), %xmm13 # %xmm13 = HashKey movdqa SHUF_MASK(%rip), %xmm2 PSHUFB_XMM %xmm2, %xmm13 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) movdqa %xmm13, %xmm2 psllq $1, %xmm13 psrlq $63, %xmm2 movdqa %xmm2, %xmm1 pslldq $8, %xmm2 psrldq $8, %xmm1 por %xmm2, %xmm13 # Reduction pshufd $0x24, %xmm1, %xmm2 pcmpeqd TWOONE(%rip), %xmm2 pand POLY(%rip), %xmm2 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) # Decrypt first few blocks movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) mov %r13, %r12 and $(3<<4), %r12 jz _initial_num_blocks_is_0_decrypt cmp $(2<<4), %r12 jb _initial_num_blocks_is_1_decrypt je _initial_num_blocks_is_2_decrypt _initial_num_blocks_is_3_decrypt: INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec sub $48, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_2_decrypt: INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec sub $32, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_1_decrypt: INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec sub $16, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_0_decrypt: INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec _initial_blocks_decrypted: cmp $0, %r13 je _zero_cipher_left_decrypt sub $64, %r13 je _four_cipher_left_decrypt _decrypt_by_4: GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec add $64, %r11 sub $64, %r13 jne _decrypt_by_4 _four_cipher_left_decrypt: GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 _zero_cipher_left_decrypt: mov %arg4, %r13 and $15, %r13 # %r13 = arg4 (mod 16) je _multiple_of_16_bytes_decrypt # Handle the last <16 byte block separately paddd ONE(%rip), %xmm0 # increment CNT to get Yn movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) lea (%arg3,%r11,1), %r10 mov %r13, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 lea ALL_F+16(%rip), %r12 sub %r13, %r12 movdqa %xmm1, %xmm2 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) movdqu (%r12), %xmm1 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm2 movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10 ,%xmm2 pxor %xmm2, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # output %r13 bytes MOVQ_R64_XMM %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_decrypt mov %rax, (%arg2 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 MOVQ_R64_XMM %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_decrypt: mov %al, (%arg2, %r11, 1) add $1, %r11 shr $8, %rax sub $1, %r13 jne _less_than_8_bytes_left_decrypt _multiple_of_16_bytes_decrypt: mov arg8, %r12 # %r13 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits movd %r12d, %xmm15 # len(A) in %xmm15 shl $3, %arg4 # len(C) in bits (*128) MOVQ_R64_XMM %arg4, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) pxor %xmm15, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm8 mov %arg5, %rax # %rax = *Y0 movdqu (%rax), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) pxor %xmm8, %xmm0 _return_T_decrypt: mov arg9, %r10 # %r10 = authTag mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_decrypt cmp $8, %r11 jl _T_4_decrypt _T_8_decrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 cmp $0, %r11 je _return_T_done_decrypt _T_4_decrypt: movd %xmm0, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 cmp $0, %r11 je _return_T_done_decrypt _T_123_decrypt: movd %xmm0, %eax cmp $2, %r11 jl _T_1_decrypt mov %ax, (%r10) cmp $2, %r11 je _return_T_done_decrypt add $2, %r10 sar $16, %eax _T_1_decrypt: mov %al, (%r10) jmp _return_T_done_decrypt _T_16_decrypt: movdqu %xmm0, (%r10) _return_T_done_decrypt: mov %r14, %rsp pop %r14 pop %r13 pop %r12 ret ENDPROC(aesni_gcm_dec) /***************************************************************************** * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. * u8 *out, // Ciphertext output. Encrypt in-place is allowed. * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) * // concatenated with 0x00000001. 16-byte aligned pointer. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, // Additional Authentication Data (AAD) * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes * u8 *auth_tag, // Authenticated Tag output. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), * // 12 or 8. * * Assumptions: * * keys: * keys are pre-expanded and aligned to 16 bytes. we are using the * first set of 11 keys in the data structure void *aes_ctx * * * iv: * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Salt (From the SA) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initialization Vector | * | (This is the sequence number from IPSec header) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * * AAD: * AAD padded to 128 bits with 0 * for example, assume AAD is a u32 vector * * if AAD is 8 bytes: * AAD[3] = {A0, A1}; * padded AAD in xmm register = {A1 A0 0 0} * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SPI (A1) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 32-bit Sequence Number (A0) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x0 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * AAD Format with 32-bit Sequence Number * * if AAD is 12 bytes: * AAD[3] = {A0, A1, A2}; * padded AAD in xmm register = {A2 A1 A0 0} * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SPI (A2) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 64-bit Extended Sequence Number {A1,A0} | * | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | 0x0 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * AAD Format with 64-bit Extended Sequence Number * * poly = x^128 + x^127 + x^126 + x^121 + 1 ***************************************************************************/ ENTRY(aesni_gcm_enc) push %r12 push %r13 push %r14 mov %rsp, %r14 # # states of %xmm registers %xmm6:%xmm15 not saved # all %xmm registers are clobbered # sub $VARIABLE_OFFSET, %rsp and $~63, %rsp mov %arg6, %r12 movdqu (%r12), %xmm13 movdqa SHUF_MASK(%rip), %xmm2 PSHUFB_XMM %xmm2, %xmm13 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) movdqa %xmm13, %xmm2 psllq $1, %xmm13 psrlq $63, %xmm2 movdqa %xmm2, %xmm1 pslldq $8, %xmm2 psrldq $8, %xmm1 por %xmm2, %xmm13 # reduce HashKey<<1 pshufd $0x24, %xmm1, %xmm2 pcmpeqd TWOONE(%rip), %xmm2 pand POLY(%rip), %xmm2 pxor %xmm2, %xmm13 movdqa %xmm13, HashKey(%rsp) mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) and $-16, %r13 mov %r13, %r12 # Encrypt first few blocks and $(3<<4), %r12 jz _initial_num_blocks_is_0_encrypt cmp $(2<<4), %r12 jb _initial_num_blocks_is_1_encrypt je _initial_num_blocks_is_2_encrypt _initial_num_blocks_is_3_encrypt: INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc sub $48, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_2_encrypt: INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc sub $32, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_1_encrypt: INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc sub $16, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_0_encrypt: INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc _initial_blocks_encrypted: # Main loop - Encrypt remaining blocks cmp $0, %r13 je _zero_cipher_left_encrypt sub $64, %r13 je _four_cipher_left_encrypt _encrypt_by_4_encrypt: GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc add $64, %r11 sub $64, %r13 jne _encrypt_by_4_encrypt _four_cipher_left_encrypt: GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 _zero_cipher_left_encrypt: mov %arg4, %r13 and $15, %r13 # %r13 = arg4 (mod 16) je _multiple_of_16_bytes_encrypt # Handle the last <16 Byte block separately paddd ONE(%rip), %xmm0 # INCR CNT to get Yn movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) lea (%arg3,%r11,1), %r10 mov %r13, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 lea ALL_F+16(%rip), %r12 sub %r13, %r12 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) movdqu (%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10,%xmm0 pxor %xmm0, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # GHASH computation for the last <16 byte block movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 # shuffle xmm0 back to output as ciphertext # Output %r13 bytes MOVQ_R64_XMM %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_encrypt mov %rax, (%arg2 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 MOVQ_R64_XMM %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_encrypt: mov %al, (%arg2, %r11, 1) add $1, %r11 shr $8, %rax sub $1, %r13 jne _less_than_8_bytes_left_encrypt _multiple_of_16_bytes_encrypt: mov arg8, %r12 # %r12 = addLen (number of bytes) shl $3, %r12 movd %r12d, %xmm15 # len(A) in %xmm15 shl $3, %arg4 # len(C) in bits (*128) MOVQ_R64_XMM %arg4, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) pxor %xmm15, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap mov %arg5, %rax # %rax = *Y0 movdqu (%rax), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) pxor %xmm8, %xmm0 _return_T_encrypt: mov arg9, %r10 # %r10 = authTag mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_encrypt cmp $8, %r11 jl _T_4_encrypt _T_8_encrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 cmp $0, %r11 je _return_T_done_encrypt _T_4_encrypt: movd %xmm0, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 cmp $0, %r11 je _return_T_done_encrypt _T_123_encrypt: movd %xmm0, %eax cmp $2, %r11 jl _T_1_encrypt mov %ax, (%r10) cmp $2, %r11 je _return_T_done_encrypt add $2, %r10 sar $16, %eax _T_1_encrypt: mov %al, (%r10) jmp _return_T_done_encrypt _T_16_encrypt: movdqu %xmm0, (%r10) _return_T_done_encrypt: mov %r14, %rsp pop %r14 pop %r13 pop %r12 ret ENDPROC(aesni_gcm_enc) #endif .align 4 _key_expansion_128: _key_expansion_256a: pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movaps %xmm0, (TKEYP) add $0x10, TKEYP ret ENDPROC(_key_expansion_128) ENDPROC(_key_expansion_256a) .align 4 _key_expansion_192a: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movaps %xmm2, %xmm5 movaps %xmm2, %xmm6 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movaps %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 movaps %xmm6, (TKEYP) shufps $0b01001110, %xmm2, %xmm1 movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP ret ENDPROC(_key_expansion_192a) .align 4 _key_expansion_192b: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movaps %xmm2, %xmm5 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movaps %xmm0, (TKEYP) add $0x10, TKEYP ret ENDPROC(_key_expansion_192b) .align 4 _key_expansion_256b: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 movaps %xmm2, (TKEYP) add $0x10, TKEYP ret ENDPROC(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, * unsigned int key_len) */ ENTRY(aesni_set_key) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP movl (FRAME_OFFSET+8)(%esp), KEYP # ctx movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key movl (FRAME_OFFSET+16)(%esp), %edx # key_len #endif movups (UKEYP), %xmm0 # user key (first 16 bytes) movaps %xmm0, (KEYP) lea 0x10(KEYP), TKEYP # key addr movl %edx, 480(KEYP) pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x cmp $24, %dl jb .Lenc_key128 je .Lenc_key192 movups 0x10(UKEYP), %xmm2 # other user key movaps %xmm2, (TKEYP) add $0x10, TKEYP AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_256a AESKEYGENASSIST 0x1 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_256a AESKEYGENASSIST 0x2 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_256a AESKEYGENASSIST 0x4 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_256a AESKEYGENASSIST 0x8 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_256a AESKEYGENASSIST 0x10 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_256a AESKEYGENASSIST 0x20 %xmm0 %xmm1 call _key_expansion_256b AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(UKEYP), %xmm2 # other user key AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_192a AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_192b AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_192a AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_192b AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_192a AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_192b AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_192a AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 call _key_expansion_128 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 call _key_expansion_128 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 call _key_expansion_128 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 call _key_expansion_128 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 call _key_expansion_128 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 call _key_expansion_128 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 call _key_expansion_128 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 call _key_expansion_128 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 call _key_expansion_128 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, TKEYP movaps (KEYP), %xmm0 movaps (TKEYP), %xmm1 movaps %xmm0, 240(TKEYP) movaps %xmm1, 240(KEYP) add $0x10, KEYP lea 240-16(TKEYP), UKEYP .align 4 .Ldec_key_loop: movaps (KEYP), %xmm0 AESIMC %xmm0 %xmm1 movaps %xmm1, (UKEYP) add $0x10, KEYP sub $0x10, UKEYP cmp TKEYP, KEYP jb .Ldec_key_loop xor AREG, AREG #ifndef __x86_64__ popl KEYP #endif FRAME_END ret ENDPROC(aesni_set_key) /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_enc) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN movl (FRAME_OFFSET+12)(%esp), KEYP # ctx movl (FRAME_OFFSET+16)(%esp), OUTP # dst movl (FRAME_OFFSET+20)(%esp), INP # src #endif movl 480(KEYP), KLEN # key length movups (INP), STATE # input call _aesni_enc1 movups STATE, (OUTP) # output #ifndef __x86_64__ popl KLEN popl KEYP #endif FRAME_END ret ENDPROC(aesni_enc) /* * _aesni_enc1: internal ABI * input: * KEYP: key struct pointer * KLEN: round count * STATE: initial state (input) * output: * STATE: finial state (output) * changed: * KEY * TKEYP (T1) */ .align 4 _aesni_enc1: movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 add $0x30, TKEYP cmp $24, KLEN jb .Lenc128 lea 0x20(TKEYP), TKEYP je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY AESENC KEY STATE movaps -0x50(TKEYP), KEY AESENC KEY STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY AESENC KEY STATE movaps -0x30(TKEYP), KEY AESENC KEY STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY AESENC KEY STATE movaps -0x10(TKEYP), KEY AESENC KEY STATE movaps (TKEYP), KEY AESENC KEY STATE movaps 0x10(TKEYP), KEY AESENC KEY STATE movaps 0x20(TKEYP), KEY AESENC KEY STATE movaps 0x30(TKEYP), KEY AESENC KEY STATE movaps 0x40(TKEYP), KEY AESENC KEY STATE movaps 0x50(TKEYP), KEY AESENC KEY STATE movaps 0x60(TKEYP), KEY AESENC KEY STATE movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE ret ENDPROC(_aesni_enc1) /* * _aesni_enc4: internal ABI * input: * KEYP: key struct pointer * KLEN: round count * STATE1: initial state (input) * STATE2 * STATE3 * STATE4 * output: * STATE1: finial state (output) * STATE2 * STATE3 * STATE4 * changed: * KEY * TKEYP (T1) */ .align 4 _aesni_enc4: movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 pxor KEY, STATE2 pxor KEY, STATE3 pxor KEY, STATE4 add $0x30, TKEYP cmp $24, KLEN jb .L4enc128 lea 0x20(TKEYP), TKEYP je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps -0x50(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps -0x30(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps -0x10(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps (TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x10(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x20(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x30(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x40(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x50(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x60(TKEYP), KEY AESENC KEY STATE1 AESENC KEY STATE2 AESENC KEY STATE3 AESENC KEY STATE4 movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE1 # last round AESENCLAST KEY STATE2 AESENCLAST KEY STATE3 AESENCLAST KEY STATE4 ret ENDPROC(_aesni_enc4) /* * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_dec) FRAME_BEGIN #ifndef __x86_64__ pushl KEYP pushl KLEN movl (FRAME_OFFSET+12)(%esp), KEYP # ctx movl (FRAME_OFFSET+16)(%esp), OUTP # dst movl (FRAME_OFFSET+20)(%esp), INP # src #endif mov 480(KEYP), KLEN # key length add $240, KEYP movups (INP), STATE # input call _aesni_dec1 movups STATE, (OUTP) #output #ifndef __x86_64__ popl KLEN popl KEYP #endif FRAME_END ret ENDPROC(aesni_dec) /* * _aesni_dec1: internal ABI * input: * KEYP: key struct pointer * KLEN: key length * STATE: initial state (input) * output: * STATE: finial state (output) * changed: * KEY * TKEYP (T1) */ .align 4 _aesni_dec1: movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE # round 0 add $0x30, TKEYP cmp $24, KLEN jb .Ldec128 lea 0x20(TKEYP), TKEYP je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY AESDEC KEY STATE movaps -0x50(TKEYP), KEY AESDEC KEY STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY AESDEC KEY STATE movaps -0x30(TKEYP), KEY AESDEC KEY STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY AESDEC KEY STATE movaps -0x10(TKEYP), KEY AESDEC KEY STATE movaps (TKEYP), KEY AESDEC KEY STATE movaps 0x10(TKEYP), KEY AESDEC KEY STATE movaps 0x20(TKEYP), KEY AESDEC KEY STATE movaps 0x30(TKEYP), KEY AESDEC KEY STATE movaps 0x40(TKEYP), KEY AESDEC KEY STATE movaps 0x50(TKEYP), KEY AESDEC KEY STATE movaps 0x60(TKEYP), KEY AESDEC KEY STATE movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE ret ENDPROC(_aesni_dec1) /* * _aesni_dec4: internal ABI * input: * KEYP: key struct pointer * KLEN: key length * STATE1: initial state (input) * STATE2 * STATE3 * STATE4 * output: * STATE1: finial state (output) * STATE2 * STATE3 * STATE4 * changed: * KEY * TKEYP (T1) */ .align 4 _aesni_dec4: movaps (KEYP), KEY # key mov KEYP, TKEYP pxor KEY, STATE1 # round 0 pxor KEY, STATE2 pxor KEY, STATE3 pxor KEY, STATE4 add $0x30, TKEYP cmp $24, KLEN jb .L4dec128 lea 0x20(TKEYP), TKEYP je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps -0x50(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps -0x30(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps -0x10(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps (TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x10(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x20(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x30(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x40(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x50(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x60(TKEYP), KEY AESDEC KEY STATE1 AESDEC KEY STATE2 AESDEC KEY STATE3 AESDEC KEY STATE4 movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE1 # last round AESDECLAST KEY STATE2 AESDECLAST KEY STATE3 AESDECLAST KEY STATE4 ret ENDPROC(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len) */ ENTRY(aesni_ecb_enc) FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN movl (FRAME_OFFSET+16)(%esp), KEYP # ctx movl (FRAME_OFFSET+20)(%esp), OUTP # dst movl (FRAME_OFFSET+24)(%esp), INP # src movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN # check length jz .Lecb_enc_ret mov 480(KEYP), KLEN cmp $16, LEN jb .Lecb_enc_ret cmp $64, LEN jb .Lecb_enc_loop1 .align 4 .Lecb_enc_loop4: movups (INP), STATE1 movups 0x10(INP), STATE2 movups 0x20(INP), STATE3 movups 0x30(INP), STATE4 call _aesni_enc4 movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) movups STATE3, 0x20(OUTP) movups STATE4, 0x30(OUTP) sub $64, LEN add $64, INP add $64, OUTP cmp $64, LEN jge .Lecb_enc_loop4 cmp $16, LEN jb .Lecb_enc_ret .align 4 .Lecb_enc_loop1: movups (INP), STATE1 call _aesni_enc1 movups STATE1, (OUTP) sub $16, LEN add $16, INP add $16, OUTP cmp $16, LEN jge .Lecb_enc_loop1 .Lecb_enc_ret: #ifndef __x86_64__ popl KLEN popl KEYP popl LEN #endif FRAME_END ret ENDPROC(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len); */ ENTRY(aesni_ecb_dec) FRAME_BEGIN #ifndef __x86_64__ pushl LEN pushl KEYP pushl KLEN movl (FRAME_OFFSET+16)(%esp), KEYP # ctx movl (FRAME_OFFSET+20)(%esp), OUTP # dst movl (FRAME_OFFSET+24)(%esp), INP # src movl (FRAME_OFFSET+28)(%esp), LEN # len #endif test LEN, LEN jz .Lecb_dec_ret mov 480(KEYP), KLEN add $240, KEYP cmp $16, LEN jb .Lecb_dec_ret cmp $64, LEN jb .Lecb_dec_loop1 .align 4 .Lecb_dec_loop4: movups (INP), STATE1 movups 0x10(INP), STATE2 movups 0x20(INP), STATE3 movups 0x30(INP), STATE4 call _aesni_dec4 movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) movups STATE3, 0x20(OUTP) movups STATE4, 0x30(OUTP) sub $64, LEN add $64, INP add $64, OUTP cmp $64, LEN jge .Lecb_dec_loop4 cmp $16, LEN jb .Lecb_dec_ret .align 4 .Lecb_dec_loop1: movups (INP), STATE1 call _aesni_dec1 movups STATE1, (OUTP) sub $16, LEN add $16, INP add $16, OUTP cmp $16, LEN jge .Lecb_dec_loop1 .Lecb_dec_ret: #ifndef __x86_64__ popl KLEN popl KEYP popl LEN #endif FRAME_END ret ENDPROC(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ ENTRY(aesni_cbc_enc) FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN movl (FRAME_OFFSET+20)(%esp), KEYP # ctx movl (FRAME_OFFSET+24)(%esp), OUTP # dst movl (FRAME_OFFSET+28)(%esp), INP # src movl (FRAME_OFFSET+32)(%esp), LEN # len movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_enc_ret mov 480(KEYP), KLEN movups (IVP), STATE # load iv as initial state .align 4 .Lcbc_enc_loop: movups (INP), IN # load input pxor IN, STATE call _aesni_enc1 movups STATE, (OUTP) # store output sub $16, LEN add $16, INP add $16, OUTP cmp $16, LEN jge .Lcbc_enc_loop movups STATE, (IVP) .Lcbc_enc_ret: #ifndef __x86_64__ popl KLEN popl KEYP popl LEN popl IVP #endif FRAME_END ret ENDPROC(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ ENTRY(aesni_cbc_dec) FRAME_BEGIN #ifndef __x86_64__ pushl IVP pushl LEN pushl KEYP pushl KLEN movl (FRAME_OFFSET+20)(%esp), KEYP # ctx movl (FRAME_OFFSET+24)(%esp), OUTP # dst movl (FRAME_OFFSET+28)(%esp), INP # src movl (FRAME_OFFSET+32)(%esp), LEN # len movl (FRAME_OFFSET+36)(%esp), IVP # iv #endif cmp $16, LEN jb .Lcbc_dec_just_ret mov 480(KEYP), KLEN add $240, KEYP movups (IVP), IV cmp $64, LEN jb .Lcbc_dec_loop1 .align 4 .Lcbc_dec_loop4: movups (INP), IN1 movaps IN1, STATE1 movups 0x10(INP), IN2 movaps IN2, STATE2 #ifdef __x86_64__ movups 0x20(INP), IN3 movaps IN3, STATE3 movups 0x30(INP), IN4 movaps IN4, STATE4 #else movups 0x20(INP), IN1 movaps IN1, STATE3 movups 0x30(INP), IN2 movaps IN2, STATE4 #endif call _aesni_dec4 pxor IV, STATE1 #ifdef __x86_64__ pxor IN1, STATE2 pxor IN2, STATE3 pxor IN3, STATE4 movaps IN4, IV #else pxor IN1, STATE4 movaps IN2, IV movups (INP), IN1 pxor IN1, STATE2 movups 0x10(INP), IN2 pxor IN2, STATE3 #endif movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) movups STATE3, 0x20(OUTP) movups STATE4, 0x30(OUTP) sub $64, LEN add $64, INP add $64, OUTP cmp $64, LEN jge .Lcbc_dec_loop4 cmp $16, LEN jb .Lcbc_dec_ret .align 4 .Lcbc_dec_loop1: movups (INP), IN movaps IN, STATE call _aesni_dec1 pxor IV, STATE movups STATE, (OUTP) movaps IN, IV sub $16, LEN add $16, INP add $16, OUTP cmp $16, LEN jge .Lcbc_dec_loop1 .Lcbc_dec_ret: movups IV, (IVP) .Lcbc_dec_just_ret: #ifndef __x86_64__ popl KLEN popl KEYP popl LEN popl IVP #endif FRAME_END ret ENDPROC(aesni_cbc_dec) #ifdef __x86_64__ .pushsection .rodata .align 16 .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 .popsection /* * _aesni_inc_init: internal ABI * setup registers used by _aesni_inc * input: * IV * output: * CTR: == IV, in little endian * TCTR_LOW: == lower qword of CTR * INC: == 1, in little endian * BSWAP_MASK == endian swapping mask */ .align 4 _aesni_inc_init: movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR PSHUFB_XMM BSWAP_MASK CTR mov $1, TCTR_LOW MOVQ_R64_XMM TCTR_LOW INC MOVQ_R64_XMM CTR TCTR_LOW ret ENDPROC(_aesni_inc_init) /* * _aesni_inc: internal ABI * Increase IV by 1, IV is in big endian * input: * IV * CTR: == IV, in little endian * TCTR_LOW: == lower qword of CTR * INC: == 1, in little endian * BSWAP_MASK == endian swapping mask * output: * IV: Increase by 1 * changed: * CTR: == output IV, in little endian * TCTR_LOW: == lower qword of CTR */ .align 4 _aesni_inc: paddq INC, CTR add $1, TCTR_LOW jnc .Linc_low pslldq $8, INC paddq INC, CTR psrldq $8, INC .Linc_low: movaps CTR, IV PSHUFB_XMM BSWAP_MASK IV ret ENDPROC(_aesni_inc) /* * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ ENTRY(aesni_ctr_enc) FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret mov 480(KEYP), KLEN movups (IVP), IV call _aesni_inc_init cmp $64, LEN jb .Lctr_enc_loop1 .align 4 .Lctr_enc_loop4: movaps IV, STATE1 call _aesni_inc movups (INP), IN1 movaps IV, STATE2 call _aesni_inc movups 0x10(INP), IN2 movaps IV, STATE3 call _aesni_inc movups 0x20(INP), IN3 movaps IV, STATE4 call _aesni_inc movups 0x30(INP), IN4 call _aesni_enc4 pxor IN1, STATE1 movups STATE1, (OUTP) pxor IN2, STATE2 movups STATE2, 0x10(OUTP) pxor IN3, STATE3 movups STATE3, 0x20(OUTP) pxor IN4, STATE4 movups STATE4, 0x30(OUTP) sub $64, LEN add $64, INP add $64, OUTP cmp $64, LEN jge .Lctr_enc_loop4 cmp $16, LEN jb .Lctr_enc_ret .align 4 .Lctr_enc_loop1: movaps IV, STATE call _aesni_inc movups (INP), IN call _aesni_enc1 pxor IN, STATE movups STATE, (OUTP) sub $16, LEN add $16, INP add $16, OUTP cmp $16, LEN jge .Lctr_enc_loop1 .Lctr_enc_ret: movups IV, (IVP) .Lctr_enc_just_ret: FRAME_END ret ENDPROC(aesni_ctr_enc) /* * _aesni_gf128mul_x_ble: internal ABI * Multiply in GF(2^128) for XTS IVs * input: * IV: current IV * GF128MUL_MASK == mask with 0x87 and 0x01 * output: * IV: next IV * changed: * CTR: == temporary value */ #define _aesni_gf128mul_x_ble() \ pshufd $0x13, IV, CTR; \ paddq IV, IV; \ psrad $31, CTR; \ pand GF128MUL_MASK, CTR; \ pxor CTR, IV; /* * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * bool enc, u8 *iv) */ ENTRY(aesni_xts_crypt8) FRAME_BEGIN cmpb $0, %cl movl $0, %ecx movl $240, %r10d leaq _aesni_enc4, %r11 leaq _aesni_dec4, %rax cmovel %r10d, %ecx cmoveq %rax, %r11 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK movups (IVP), IV mov 480(KEYP), KLEN addq %rcx, KEYP movdqa IV, STATE1 movdqu 0x00(INP), INC pxor INC, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 movdqu 0x10(INP), INC pxor INC, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 movdqu 0x20(INP), INC pxor INC, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 movdqu 0x30(INP), INC pxor INC, STATE4 movdqu IV, 0x30(OUTP) call *%r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 movdqu STATE1, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE1 movdqu 0x40(INP), INC pxor INC, STATE1 movdqu IV, 0x40(OUTP) movdqu 0x10(OUTP), INC pxor INC, STATE2 movdqu STATE2, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 movdqu 0x50(INP), INC pxor INC, STATE2 movdqu IV, 0x50(OUTP) movdqu 0x20(OUTP), INC pxor INC, STATE3 movdqu STATE3, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 movdqu 0x60(INP), INC pxor INC, STATE3 movdqu IV, 0x60(OUTP) movdqu 0x30(OUTP), INC pxor INC, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 movdqu 0x70(INP), INC pxor INC, STATE4 movdqu IV, 0x70(OUTP) _aesni_gf128mul_x_ble() movups IV, (IVP) call *%r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 movdqu STATE1, 0x40(OUTP) movdqu 0x50(OUTP), INC pxor INC, STATE2 movdqu STATE2, 0x50(OUTP) movdqu 0x60(OUTP), INC pxor INC, STATE3 movdqu STATE3, 0x60(OUTP) movdqu 0x70(OUTP), INC pxor INC, STATE4 movdqu STATE4, 0x70(OUTP) FRAME_END ret ENDPROC(aesni_xts_crypt8) #endif