diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b95778311178ba9234375da617b400bc2cea86af..84d7dbaba26e37d88489a6994ba36a2374788785 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,3 +48,4 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o crc32c-intel-y := crc32c-intel_glue.o +crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 493f959261f7274eaf69729a7add9a2664d475fd..6812ad98355c3d0e93be5e07c47e5e3956ebc8a4 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,6 +32,8 @@ #include #include +#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -44,6 +46,31 @@ #define REX_PRE #endif +#ifdef CONFIG_X86_64 +/* + * use carryless multiply version of crc32c when buffer + * size is >= 512 (when eager fpu is enabled) or + * >= 1024 (when eager fpu is disabled) to account + * for fpu state save/restore overhead. + */ +#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 +#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); +static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; +#if defined(X86_FEATURE_EAGER_FPU) +#define set_pcl_breakeven_point() \ +do { \ + if (!use_eager_fpu()) \ + crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ +} while (0) +#else +#define set_pcl_breakeven_point() \ + (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) +#endif +#endif /* CONFIG_X86_64 */ + static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) { while (length--) { @@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm) return 0; } +#ifdef CONFIG_X86_64 +static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + /* + * use faster PCL version if datasize is large enough to + * overcome kernel fpu state save/restore overhead + */ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + } else + *crcp = crc32c_intel_le_hw(*crcp, data, len); + return 0; +} + +static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + } else + *(__le32 *)out = + ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); + return 0; +} + +static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +#endif /* CONFIG_X86_64 */ + static struct shash_alg alg = { .setkey = crc32c_intel_setkey, .init = crc32c_intel_init, @@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void) { if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; +#ifdef CONFIG_X86_64 + if (cpu_has_pclmulqdq) { + alg.update = crc32c_pcl_intel_update; + alg.finup = crc32c_pcl_intel_finup; + alg.digest = crc32c_pcl_intel_digest; + set_pcl_breakeven_point(); + } +#endif return crypto_register_shash(&alg); } diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S new file mode 100644 index 0000000000000000000000000000000000000000..93c6d39237ac3753c1f1e7600351192715829a90 --- /dev/null +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,460 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white paper on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://download.intel.com/design/intarch/papers/323405.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali + * James Guilford + * David Cote + * Tim Chen + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_ SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +.global crc_pcl +crc_pcl: +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + ################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff --git a/crypto/Kconfig b/crypto/Kconfig index 6563366bae809f8e5b052e003ad42c6c2f2adb33..b901b590635f1b5ca7f44b1270db837cfee746ca 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -324,9 +324,19 @@ config CRYPTO_CRC32C by iSCSI for header and data digests and by others. See Castagnoli93. Module will be crc32c. +config CRYPTO_CRC32C_X86_64 + bool + depends on X86 && 64BIT + select CRYPTO_HASH + help + In Intel processor with SSE4.2 supported, the processor will + support CRC32C calculation using hardware accelerated CRC32 + instruction optimized with PCLMULQDQ instruction when available. + config CRYPTO_CRC32C_INTEL tristate "CRC32c INTEL hardware acceleration" depends on X86 + select CRYPTO_CRC32C_X86_64 if 64BIT select CRYPTO_HASH help In Intel processor with SSE4.2 supported, the processor will