Commit 5a0387a8 authored by: Linus Torvalds

Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "Here is the crypto update for 4.12:

  API:
   - Add batch registration for acomp/scomp
   - Change acomp testing to non-unique compressed result
   - Extend algorithm name limit to 128 bytes
   - Require setkey before accept(2) in algif_aead

  Algorithms:
   - Add support for deflate rfc1950 (zlib)

  Drivers:
   - Add accelerated crct10dif for powerpc
   - Add crc32 in stm32
   - Add sha384/sha512 in ccp
   - Add 3des/gcm(aes) for v5 devices in ccp
   - Add Queue Interface (QI) backend support in caam
   - Add new Exynos RNG driver
   - Add ThunderX ZIP driver
   - Add driver for hardware random generator on MT7623 SoC"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (101 commits)
  crypto: stm32 - Fix OF module alias information
  crypto: algif_aead - Require setkey before accept(2)
  crypto: scomp - add support for deflate rfc1950 (zlib)
  crypto: scomp - allow registration of multiple scomps
  crypto: ccp - Change ISR handler method for a v5 CCP
  crypto: ccp - Change ISR handler method for a v3 CCP
  crypto: crypto4xx - rename ce_ring_contol to ce_ring_control
  crypto: testmgr - Allow ecb(cipher_null) in FIPS mode
  Revert "crypto: arm64/sha - Add constant operand modifier to ASM_EXPORT"
  crypto: ccp - Disable interrupts early on unload
  crypto: ccp - Use only the relevant interrupt bits
  hwrng: mtk - Add driver for hardware random generator on MT7623 SoC
  dt-bindings: hwrng: Add Mediatek hardware random generator bindings
  crypto: crct10dif-vpmsum - Fix missing preempt_disable()
  crypto: testmgr - replace compression known answer test
  crypto: acomp - allow registration of multiple acomps
  hwrng: n2 - Use devm_kcalloc() in n2rng_probe()
  crypto: chcr - Fix error handling related to 'chcr_alloc_shash'
  padata: get_next is never NULL
  crypto: exynos - Add new Exynos RNG driver
  ...
@@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
     char ctx[];
 };

-static struct sdescinit_sdesc(struct crypto_shash *alg)
+static struct sdesc init_sdesc(struct crypto_shash *alg)
 {
-    struct sdescsdesc;
+    struct sdesc sdesc;
     int size;

     size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
@@ -172,7 +172,7 @@ Code Example For Use of Operational State Memory With SHASH
 static int calc_hash(struct crypto_shashalg,
                      const unsigned chardata, unsigned int datalen,
                      unsigned chardigest) {
-    struct sdescsdesc;
+    struct sdesc sdesc;
     int ret;

     sdesc = init_sdesc(alg);
...
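For orientation, the snippet above is the kernel documentation's shash usage example. Here is a minimal sketch of how those helpers are typically driven; the "sha256" name and this wrapper are assumptions for illustration, not part of the merge:

    #include <crypto/hash.h>
    #include <linux/err.h>

    /* Sketch: hash a buffer via the calc_hash() helper documented above.
     * The "sha256" algorithm name is an assumed example. */
    static int test_hash(const unsigned char *data, unsigned int datalen,
                         unsigned char *digest)
    {
            struct crypto_shash *alg;
            int ret;

            alg = crypto_alloc_shash("sha256", 0, 0);
            if (IS_ERR(alg))
                    return PTR_ERR(alg);

            ret = calc_hash(alg, data, datalen, digest);
            crypto_free_shash(alg);
            return ret;
    }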
* STMicroelectronics STM32 CRC
Required properties:
- compatible: Should be "st,stm32f7-crc".
- reg: The address and length of the peripheral registers space
- clocks: The input clock of the CRC instance
Optional properties: none
Example:
crc: crc@40023000 {
compatible = "st,stm32f7-crc";
reg = <0x40023000 0x400>;
clocks = <&rcc 0 12>;
};
@@ -6,9 +6,16 @@ Required properties:
 - compatible : should be "amlogic,meson-rng"
 - reg : Specifies base physical address and size of the registers.

+Optional properties:
+- clocks : phandle to the following named clocks
+- clock-names: Name of core clock, must be "core"
+
 Example:

 rng {
 	compatible = "amlogic,meson-rng";
 	reg = <0x0 0xc8834000 0x0 0x4>;
+	clocks = <&clkc CLKID_RNG0>;
+	clock-names = "core";
 };
Device-Tree bindings for Mediatek random number generator
found in Mediatek SoC family
Required properties:
- compatible : Should be "mediatek,mt7623-rng"
- clocks : list of clock specifiers, corresponding to
entries in clock-names property;
- clock-names : Should contain "rng" entries;
- reg : Specifies base physical address and size of the registers
Example:
rng: rng@1020f000 {
compatible = "mediatek,mt7623-rng";
reg = <0 0x1020f000 0 0x1000>;
clocks = <&infracfg CLK_INFRA_TRNG>;
clock-names = "rng";
};
@@ -6242,7 +6242,7 @@ F: drivers/crypto/nx/nx_csbcpb.h
 F:	drivers/crypto/nx/nx_debugfs.h

 IBM Power 842 compression accelerator
-M:	Dan Streetman <ddstreet@ieee.org>
+M:	Haren Myneni <haren@us.ibm.com>
 S:	Supported
 F:	drivers/crypto/nx/Makefile
 F:	drivers/crypto/nx/Kconfig
@@ -10954,6 +10954,14 @@ L: alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:	Supported
 F:	sound/soc/samsung/

+SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER
+M:	Krzysztof Kozlowski <krzk@kernel.org>
+L:	linux-crypto@vger.kernel.org
+L:	linux-samsung-soc@vger.kernel.org
+S:	Maintained
+F:	drivers/crypto/exynos-rng.c
+F:	Documentation/devicetree/bindings/rng/samsung,exynos-rng4.txt
+
 SAMSUNG FRAMEBUFFER DRIVER
 M:	Jingoo Han <jingoohan1@gmail.com>
 L:	linux-fbdev@vger.kernel.org
@@ -10978,6 +10986,14 @@ F: Documentation/devicetree/bindings/regulator/samsung,s2m*.txt
 F:	Documentation/devicetree/bindings/regulator/samsung,s5m*.txt
 F:	Documentation/devicetree/bindings/clock/samsung,s2mps11.txt

+SAMSUNG S5P Security SubSystem (SSS) DRIVER
+M:	Krzysztof Kozlowski <krzk@kernel.org>
+M:	Vladimir Zapolskiy <vz@mleia.com>
+L:	linux-crypto@vger.kernel.org
+L:	linux-samsung-soc@vger.kernel.org
+S:	Maintained
+F:	drivers/crypto/s5p-sss.c
+
 SAMSUNG S5P/EXYNOS4 SOC SERIES CAMERA SUBSYSTEM DRIVERS
 M:	Kyungmin Park <kyungmin.park@samsung.com>
 M:	Sylwester Nawrocki <s.nawrocki@samsung.com>
...
@@ -89,6 +89,10 @@
 	clock-frequency = <25000000>;
 };

+&crc {
+	status = "okay";
+};
+
 &usart1 {
 	pinctrl-0 = <&usart1_pins_a>;
 	pinctrl-names = "default";
...
@@ -289,6 +289,13 @@
 		};
 	};

+	crc: crc@40023000 {
+		compatible = "st,stm32f7-crc";
+		reg = <0x40023000 0x400>;
+		clocks = <&rcc 0 12>;
+		status = "disabled";
+	};
+
 	rcc: rcc@40023800 {
 		#clock-cells = <2>;
 		compatible = "st,stm32f42xx-rcc", "st,stm32-rcc";
...
@@ -75,5 +75,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_FTRACE is not set
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_DEV_STM32=y
 CONFIG_CRC_ITU_T=y
 CONFIG_CRC7=y
@@ -73,7 +73,7 @@ config CRYPTO_AES_ARM_BS
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
-	select CRYPTO_AES_ARM
+	select CRYPTO_AES
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
...
@@ -42,9 +42,6 @@ asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
 asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]);

-asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
-				  u8 out[]);
-
 struct aesbs_ctx {
 	int	rounds;
 	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
@@ -52,12 +49,12 @@ struct aesbs_ctx {

 struct aesbs_cbc_ctx {
 	struct aesbs_ctx	key;
-	u32			enc[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*enc_tfm;
 };

 struct aesbs_xts_ctx {
 	struct aesbs_ctx	key;
-	u32			twkey[AES_MAX_KEYLENGTH_U32];
+	struct crypto_cipher	*tweak_tfm;
 };

 static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@@ -132,20 +129,18 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,

 	ctx->key.rounds = 6 + key_len / 4;

-	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
-
 	kernel_neon_begin();
 	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
 	kernel_neon_end();

-	return 0;
+	return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
 }

 static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
 {
 	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);

-	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+	crypto_cipher_encrypt_one(ctx->enc_tfm, dst, src);
 }

 static int cbc_encrypt(struct skcipher_request *req)
@@ -181,6 +176,23 @@ static int cbc_decrypt(struct skcipher_request *req)
 	return err;
 }

+static int cbc_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->enc_tfm))
+		return PTR_ERR(ctx->enc_tfm);
+	return 0;
+}
+
+static void cbc_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->enc_tfm);
+}
+
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -228,7 +240,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			    unsigned int key_len)
 {
 	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct crypto_aes_ctx rk;
 	int err;

 	err = xts_verify_key(tfm, in_key, key_len);
@@ -236,15 +247,30 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 		return err;

 	key_len /= 2;
-	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len);
 	if (err)
 		return err;

-	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
-
 	return aesbs_setkey(tfm, in_key, key_len);
 }

+static int xts_init(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(ctx->tweak_tfm))
+		return PTR_ERR(ctx->tweak_tfm);
+	return 0;
+}
+
+static void xts_exit(struct crypto_tfm *tfm)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_cipher(ctx->tweak_tfm);
+}
+
 static int __xts_crypt(struct skcipher_request *req,
 		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
 				  int rounds, int blocks, u8 iv[]))
@@ -256,7 +282,7 @@ static int __xts_crypt(struct skcipher_request *req,

 	err = skcipher_walk_virt(&walk, req, true);

-	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+	crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);

 	kernel_neon_begin();
 	while (walk.nbytes >= AES_BLOCK_SIZE) {
@@ -309,6 +335,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= cbc_init,
+	.base.cra_exit		= cbc_exit,

 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -342,6 +370,8 @@ static struct skcipher_alg aes_algs[] = { {
 	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+	.base.cra_init		= xts_init,
+	.base.cra_exit		= xts_exit,

 	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
@@ -402,5 +432,5 @@ static int __init aes_init(void)
 	return err;
 }

-module_init(aes_init);
+late_initcall(aes_init);
 module_exit(aes_exit);
@@ -380,7 +380,7 @@
 			#size-cells = <2>;
 			ranges = <0x0 0x0 0x0 0xc8834000 0x0 0x2000>;

-			rng {
+			hwrng: rng {
 				compatible = "amlogic,meson-rng";
 				reg = <0x0 0x0 0x0 0x4>;
 			};
...
@@ -524,3 +524,8 @@
 &vpu {
 	compatible = "amlogic,meson-gxbb-vpu", "amlogic,meson-gx-vpu";
 };
+
+&hwrng {
+	clocks = <&clkc CLKID_RNG0>;
+	clock-names = "core";
+};
@@ -31,8 +31,6 @@ static void tbi_boing_init(void)
 }
 #endif

-#define ALIGN_DOWN(addr, size) ((addr)&(~((size)-1)))
-
 /*
  * Unwind the current stack frame and store the new register values in the
  * structure passed as argument. Unwinding is equivalent to a function return,
...
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
+obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o

 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
@@ -17,3 +19,4 @@ sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
/*
* CRC vpmsum tester
* Copyright 2017 Daniel Axtens, IBM Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/crc-t10dif.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/cpufeature.h>
#include <asm/switch_to.h>
static unsigned long iterations = 10000;
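/* number of random test lengths per run; settable via the "iterations"
 * module parameter declared at the bottom of the file */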
#define MAX_CRC_LENGTH 65535
static int __init crc_test_init(void)
{
u16 crc16 = 0, verify16 = 0;
u32 crc32 = 0, verify32 = 0;
__le32 verify32le = 0;
unsigned char *data;
unsigned long i;
int ret;
struct crypto_shash *crct10dif_tfm;
struct crypto_shash *crc32c_tfm;
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return -ENODEV;
data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
if (!data)
return -ENOMEM;
crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
if (IS_ERR(crct10dif_tfm)) {
pr_err("Error allocating crc-t10dif\n");
goto free_buf;
}
crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(crc32c_tfm)) {
pr_err("Error allocating crc32c\n");
goto free_16;
}
do {
SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);
crct10dif_shash->tfm = crct10dif_tfm;
ret = crypto_shash_init(crct10dif_shash);
if (ret) {
pr_err("Error initing crc-t10dif\n");
goto free_32;
}
crc32c_shash->tfm = crc32c_tfm;
ret = crypto_shash_init(crc32c_shash);
if (ret) {
pr_err("Error initing crc32c\n");
goto free_32;
}
pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
for (i=0; i<iterations; i++) {
size_t len, offset;
get_random_bytes(data, MAX_CRC_LENGTH);
get_random_bytes(&len, sizeof(len));
get_random_bytes(&offset, sizeof(offset));
len %= MAX_CRC_LENGTH;
offset &= 15;
if (len <= offset)
continue;
len -= offset;
crypto_shash_update(crct10dif_shash, data+offset, len);
crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
verify16 = crc_t10dif_generic(verify16, data+offset, len);
if (crc16 != verify16) {
pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %lu)\n",
crc16, verify16, len);
break;
}
crypto_shash_update(crc32c_shash, data+offset, len);
crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
verify32 = le32_to_cpu(verify32le);
verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
if (crc32 != (u32)verify32le) {
pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %lu)\n",
crc32, verify32, len);
break;
}
}
pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);
} while (0);
free_32:
crypto_free_shash(crc32c_tfm);
free_16:
crypto_free_shash(crct10dif_tfm);
free_buf:
kfree(data);
return 0;
}
static void __exit crc_test_exit(void) {}
module_init(crc_test_init);
module_exit(crc_test_exit);
module_param(iterations, long, 0400);
MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
MODULE_LICENSE("GPL");
/*
* Core of the accelerated CRC algorithm.
* In your file, define the constants and CRC_FUNCTION_NAME
* Then include this file.
*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
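 *
 * (Sketch added for clarity, not in the original comment: with
 * m = floor(x^64 / n) precomputed, Barrett computes q = floor(a*m / x^64)
 * and then r = a xor (q*n), subtraction being xor in GF(2).)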
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>
#define MAX_SIZE 32768
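/* Per-pass block limit: larger buffers are checksummed 32 kB at a time,
 * carrying the 1024-bit partial remainder between passes (see the header
 * comment above). */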
.text
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
#define off16 r25
#define off32 r26
#define off48 r27
#define off64 r28
#define off80 r29
#define off96 r30
#define off112 r31
#define const1 v24
#define const2 v25
#define byteswap v26
#define mask_32bit v27
#define mask_64bit v28
#define zeroes v29
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
std r31,-8(r1)
std r30,-16(r1)
std r29,-24(r1)
std r28,-32(r1)
std r27,-40(r1)
std r26,-48(r1)
std r25,-56(r1)
li off16,16
li off32,32
li off48,48
li off64,64
li off80,80
li off96,96
li off112,112
li r0,0
/* Enough room for saving 10 non volatile VMX registers */
subi r6,r1,56+10*16
subi r7,r1,56+2*16
stvx v20,0,r6
stvx v21,off16,r6
stvx v22,off32,r6
stvx v23,off48,r6
stvx v24,off64,r6
stvx v25,off80,r6
stvx v26,off96,r6
stvx v27,off112,r6
stvx v28,0,r7
stvx v29,off16,r7
mr r10,r3
vxor zeroes,zeroes,zeroes
vspltisw v0,-1
vsldoi mask_32bit,zeroes,v0,4
vsldoi mask_64bit,zeroes,v0,8
/* Get the initial value into v8 */
vxor v8,v8,v8
MTVRD(v8, R3)
#ifdef REFLECT
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
#else
vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
#endif
#ifdef BYTESWAP_DATA
addis r3,r2,.byteswap_constant@toc@ha
addi r3,r3,.byteswap_constant@toc@l
lvx byteswap,0,r3
addi r3,r3,16
#endif
cmpdi r5,256
blt .Lshort
rldicr r6,r5,0,56
/* Checksum in blocks of MAX_SIZE */
1: lis r7,MAX_SIZE@h
ori r7,r7,MAX_SIZE@l
mr r9,r7
cmpd r6,r7
bgt 2f
mr r7,r6
2: subf r6,r7,r6
/* our main loop does 128 bytes at a time */
srdi r7,r7,7
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
sldi r8,r7,4
srdi r9,r9,3
subf r8,r8,r9
/* We reduce our final 128 bytes in a separate step */
addi r7,r7,-1
mtctr r7
addis r3,r2,.constants@toc@ha
addi r3,r3,.constants@toc@l
/* Find the start of our constants */
add r3,r3,r8
/* zero v0-v7 which will contain our checksums */
vxor v0,v0,v0
vxor v1,v1,v1
vxor v2,v2,v2
vxor v3,v3,v3
vxor v4,v4,v4
vxor v5,v5,v5
vxor v6,v6,v6
vxor v7,v7,v7
lvx const1,0,r3
/*
* If we are looping back to consume more data we use the values
* already in v16-v23.
*/
cmpdi r0,1
beq 2f
/* First warm up pass */
lvx v16,0,r4
lvx v17,off16,r4
VPERM(v16,v16,v16,byteswap)
VPERM(v17,v17,v17,byteswap)
lvx v18,off32,r4
lvx v19,off48,r4
VPERM(v18,v18,v18,byteswap)
VPERM(v19,v19,v19,byteswap)
lvx v20,off64,r4
lvx v21,off80,r4
VPERM(v20,v20,v20,byteswap)
VPERM(v21,v21,v21,byteswap)
lvx v22,off96,r4
lvx v23,off112,r4
VPERM(v22,v22,v22,byteswap)
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
/* xor in initial value */
vxor v16,v16,v8
2: bdz .Lfirst_warm_up_done
addi r3,r3,16
lvx const2,0,r3
/* Second warm up pass */
VPMSUMD(v8,v16,const1)
lvx v16,0,r4
VPERM(v16,v16,v16,byteswap)
ori r2,r2,0
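	/* added note: "ori r2,r2,0" here and below is a no-op, apparently
	 * placed to pad dispatch groups on POWER8 */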
VPMSUMD(v9,v17,const1)
lvx v17,off16,r4
VPERM(v17,v17,v17,byteswap)
ori r2,r2,0
VPMSUMD(v10,v18,const1)
lvx v18,off32,r4
VPERM(v18,v18,v18,byteswap)
ori r2,r2,0
VPMSUMD(v11,v19,const1)
lvx v19,off48,r4
VPERM(v19,v19,v19,byteswap)
ori r2,r2,0
VPMSUMD(v12,v20,const1)
lvx v20,off64,r4
VPERM(v20,v20,v20,byteswap)
ori r2,r2,0
VPMSUMD(v13,v21,const1)
lvx v21,off80,r4
VPERM(v21,v21,v21,byteswap)
ori r2,r2,0
VPMSUMD(v14,v22,const1)
lvx v22,off96,r4
VPERM(v22,v22,v22,byteswap)
ori r2,r2,0
VPMSUMD(v15,v23,const1)
lvx v23,off112,r4
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
bdz .Lfirst_cool_down
/*
* main loop. We modulo schedule it such that it takes three iterations
* to complete - first iteration load, second iteration vpmsum, third
* iteration xor.
*/
.balign 16
4: lvx const1,0,r3
addi r3,r3,16
ori r2,r2,0
vxor v0,v0,v8
VPMSUMD(v8,v16,const2)
lvx v16,0,r4
VPERM(v16,v16,v16,byteswap)
ori r2,r2,0
vxor v1,v1,v9
VPMSUMD(v9,v17,const2)
lvx v17,off16,r4
VPERM(v17,v17,v17,byteswap)
ori r2,r2,0
vxor v2,v2,v10
VPMSUMD(v10,v18,const2)
lvx v18,off32,r4
VPERM(v18,v18,v18,byteswap)
ori r2,r2,0
vxor v3,v3,v11
VPMSUMD(v11,v19,const2)
lvx v19,off48,r4
VPERM(v19,v19,v19,byteswap)
lvx const2,0,r3
ori r2,r2,0
vxor v4,v4,v12
VPMSUMD(v12,v20,const1)
lvx v20,off64,r4
VPERM(v20,v20,v20,byteswap)
ori r2,r2,0
vxor v5,v5,v13
VPMSUMD(v13,v21,const1)
lvx v21,off80,r4
VPERM(v21,v21,v21,byteswap)
ori r2,r2,0
vxor v6,v6,v14
VPMSUMD(v14,v22,const1)
lvx v22,off96,r4
VPERM(v22,v22,v22,byteswap)
ori r2,r2,0
vxor v7,v7,v15
VPMSUMD(v15,v23,const1)
lvx v23,off112,r4
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
bdnz 4b
.Lfirst_cool_down:
/* First cool down pass */
lvx const1,0,r3
addi r3,r3,16
vxor v0,v0,v8
VPMSUMD(v8,v16,const1)
ori r2,r2,0
vxor v1,v1,v9
VPMSUMD(v9,v17,const1)
ori r2,r2,0
vxor v2,v2,v10
VPMSUMD(v10,v18,const1)
ori r2,r2,0
vxor v3,v3,v11
VPMSUMD(v11,v19,const1)
ori r2,r2,0
vxor v4,v4,v12
VPMSUMD(v12,v20,const1)
ori r2,r2,0
vxor v5,v5,v13
VPMSUMD(v13,v21,const1)
ori r2,r2,0
vxor v6,v6,v14
VPMSUMD(v14,v22,const1)
ori r2,r2,0
vxor v7,v7,v15
VPMSUMD(v15,v23,const1)
ori r2,r2,0
.Lsecond_cool_down:
/* Second cool down pass */
vxor v0,v0,v8
vxor v1,v1,v9
vxor v2,v2,v10
vxor v3,v3,v11
vxor v4,v4,v12
vxor v5,v5,v13
vxor v6,v6,v14
vxor v7,v7,v15
#ifdef REFLECT
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
vsldoi v0,v0,zeroes,4
vsldoi v1,v1,zeroes,4
vsldoi v2,v2,zeroes,4
vsldoi v3,v3,zeroes,4
vsldoi v4,v4,zeroes,4
vsldoi v5,v5,zeroes,4
vsldoi v6,v6,zeroes,4
vsldoi v7,v7,zeroes,4
#endif
/* xor with last 1024 bits */
lvx v8,0,r4
lvx v9,off16,r4
VPERM(v8,v8,v8,byteswap)
VPERM(v9,v9,v9,byteswap)
lvx v10,off32,r4
lvx v11,off48,r4
VPERM(v10,v10,v10,byteswap)
VPERM(v11,v11,v11,byteswap)
lvx v12,off64,r4
lvx v13,off80,r4
VPERM(v12,v12,v12,byteswap)
VPERM(v13,v13,v13,byteswap)
lvx v14,off96,r4
lvx v15,off112,r4
VPERM(v14,v14,v14,byteswap)
VPERM(v15,v15,v15,byteswap)
addi r4,r4,8*16
vxor v16,v0,v8
vxor v17,v1,v9
vxor v18,v2,v10
vxor v19,v3,v11
vxor v20,v4,v12
vxor v21,v5,v13
vxor v22,v6,v14
vxor v23,v7,v15
li r0,1
cmpdi r6,0
addi r6,r6,128
bne 1b
/* Work out how many bytes we have left */
andi. r5,r5,127
/* Calculate where in the constant table we need to start */
subfic r6,r5,128
add r3,r3,r6
/* How many 16 byte chunks are in the tail */
srdi r7,r5,4
mtctr r7
/*
* Reduce the previously calculated 1024 bits to 64 bits, shifting
* 32 bits to include the trailing 32 bits of zeros
*/
lvx v0,0,r3
lvx v1,off16,r3
lvx v2,off32,r3
lvx v3,off48,r3
lvx v4,off64,r3
lvx v5,off80,r3
lvx v6,off96,r3
lvx v7,off112,r3
addi r3,r3,8*16
VPMSUMW(v0,v16,v0)
VPMSUMW(v1,v17,v1)
VPMSUMW(v2,v18,v2)
VPMSUMW(v3,v19,v3)
VPMSUMW(v4,v20,v4)
VPMSUMW(v5,v21,v5)
VPMSUMW(v6,v22,v6)
VPMSUMW(v7,v23,v7)
/* Now reduce the tail (0 - 112 bytes) */
cmpdi r7,0
beq 1f
lvx v16,0,r4
lvx v17,0,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off16,r4
lvx v17,off16,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off32,r4
lvx v17,off32,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off48,r4
lvx v17,off48,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off64,r4
lvx v17,off64,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off80,r4
lvx v17,off80,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off96,r4
lvx v17,off96,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
/* Now xor all the parallel chunks together */
1: vxor v0,v0,v1
vxor v2,v2,v3
vxor v4,v4,v5
vxor v6,v6,v7
vxor v0,v0,v2
vxor v4,v4,v6
vxor v0,v0,v4
.Lbarrett_reduction:
/* Barrett constants */
addis r3,r2,.barrett_constants@toc@ha
addi r3,r3,.barrett_constants@toc@l
lvx const1,0,r3
lvx const2,off16,r3
vsldoi v1,v0,v0,8
vxor v0,v0,v1 /* xor two 64 bit results together */
#ifdef REFLECT
/* shift left one bit */
vspltisb v1,1
vsl v0,v0,v1
#endif
vand v0,v0,mask_64bit
#ifndef REFLECT
/*
* Now for the Barrett reduction algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
VPMSUMD(v1,v0,const1) /* ma */
vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
VPMSUMD(v1,v1,const2) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
#else
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
vand v1,v0,mask_32bit /* bottom 32 bits of a */
VPMSUMD(v1,v1,const1) /* ma */
vand v1,v1,mask_32bit /* bottom 32bits of ma */
VPMSUMD(v1,v1,const2) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
#endif
/* Get it into r3 */
MFVRD(R3, v0)
.Lout:
subi r6,r1,56+10*16
subi r7,r1,56+2*16
lvx v20,0,r6
lvx v21,off16,r6
lvx v22,off32,r6
lvx v23,off48,r6
lvx v24,off64,r6
lvx v25,off80,r6
lvx v26,off96,r6
lvx v27,off112,r6
lvx v28,0,r7
lvx v29,off16,r7
ld r31,-8(r1)
ld r30,-16(r1)
ld r29,-24(r1)
ld r28,-32(r1)
ld r27,-40(r1)
ld r26,-48(r1)
ld r25,-56(r1)
blr
.Lfirst_warm_up_done:
lvx const1,0,r3
addi r3,r3,16
VPMSUMD(v8,v16,const1)
VPMSUMD(v9,v17,const1)
VPMSUMD(v10,v18,const1)
VPMSUMD(v11,v19,const1)
VPMSUMD(v12,v20,const1)
VPMSUMD(v13,v21,const1)
VPMSUMD(v14,v22,const1)
VPMSUMD(v15,v23,const1)
b .Lsecond_cool_down
.Lshort:
cmpdi r5,0
beq .Lzero
addis r3,r2,.short_constants@toc@ha
addi r3,r3,.short_constants@toc@l
/* Calculate where in the constant table we need to start */
subfic r6,r5,256
add r3,r3,r6
/* How many 16 byte chunks? */
srdi r7,r5,4
mtctr r7
vxor v19,v19,v19
vxor v20,v20,v20
lvx v0,0,r4
lvx v16,0,r3
VPERM(v0,v0,v16,byteswap)
vxor v0,v0,v8 /* xor in initial value */
VPMSUMW(v0,v0,v16)
bdz .Lv0
lvx v1,off16,r4
lvx v17,off16,r3
VPERM(v1,v1,v17,byteswap)
VPMSUMW(v1,v1,v17)
bdz .Lv1
lvx v2,off32,r4
lvx v16,off32,r3
VPERM(v2,v2,v16,byteswap)
VPMSUMW(v2,v2,v16)
bdz .Lv2
lvx v3,off48,r4
lvx v17,off48,r3
VPERM(v3,v3,v17,byteswap)
VPMSUMW(v3,v3,v17)
bdz .Lv3
lvx v4,off64,r4
lvx v16,off64,r3
VPERM(v4,v4,v16,byteswap)
VPMSUMW(v4,v4,v16)
bdz .Lv4
lvx v5,off80,r4
lvx v17,off80,r3
VPERM(v5,v5,v17,byteswap)
VPMSUMW(v5,v5,v17)
bdz .Lv5
lvx v6,off96,r4
lvx v16,off96,r3
VPERM(v6,v6,v16,byteswap)
VPMSUMW(v6,v6,v16)
bdz .Lv6
lvx v7,off112,r4
lvx v17,off112,r3
VPERM(v7,v7,v17,byteswap)
VPMSUMW(v7,v7,v17)
bdz .Lv7
addi r3,r3,128
addi r4,r4,128
lvx v8,0,r4
lvx v16,0,r3
VPERM(v8,v8,v16,byteswap)
VPMSUMW(v8,v8,v16)
bdz .Lv8
lvx v9,off16,r4
lvx v17,off16,r3
VPERM(v9,v9,v17,byteswap)
VPMSUMW(v9,v9,v17)
bdz .Lv9
lvx v10,off32,r4
lvx v16,off32,r3
VPERM(v10,v10,v16,byteswap)
VPMSUMW(v10,v10,v16)
bdz .Lv10
lvx v11,off48,r4
lvx v17,off48,r3
VPERM(v11,v11,v17,byteswap)
VPMSUMW(v11,v11,v17)
bdz .Lv11
lvx v12,off64,r4
lvx v16,off64,r3
VPERM(v12,v12,v16,byteswap)
VPMSUMW(v12,v12,v16)
bdz .Lv12
lvx v13,off80,r4
lvx v17,off80,r3
VPERM(v13,v13,v17,byteswap)
VPMSUMW(v13,v13,v17)
bdz .Lv13
lvx v14,off96,r4
lvx v16,off96,r3
VPERM(v14,v14,v16,byteswap)
VPMSUMW(v14,v14,v16)
bdz .Lv14
lvx v15,off112,r4
lvx v17,off112,r3
VPERM(v15,v15,v17,byteswap)
VPMSUMW(v15,v15,v17)
.Lv15: vxor v19,v19,v15
.Lv14: vxor v20,v20,v14
.Lv13: vxor v19,v19,v13
.Lv12: vxor v20,v20,v12
.Lv11: vxor v19,v19,v11
.Lv10: vxor v20,v20,v10
.Lv9: vxor v19,v19,v9
.Lv8: vxor v20,v20,v8
.Lv7: vxor v19,v19,v7
.Lv6: vxor v20,v20,v6
.Lv5: vxor v19,v19,v5
.Lv4: vxor v20,v20,v4
.Lv3: vxor v19,v19,v3
.Lv2: vxor v20,v20,v2
.Lv1: vxor v19,v19,v1
.Lv0: vxor v20,v20,v0
vxor v0,v19,v20
b .Lbarrett_reduction
.Lzero:
mr r3,r10
b .Lout
FUNC_END(CRC_FUNCTION_NAME)
 /*
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
- * 16 bytes.
- *
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
- * chunks in order to mask the latency of the vpmsum instructions. If we
- * have more than 32 kB of data to checksum we repeat this step multiple
- * times, passing in the previous 1024 bits.
- *
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
- * 32 bits of 0s to the end - this matches what a CRC does. We just
- * calculate constants that land the data in this 32 bits.
- *
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
- * for n = CRC using POWER8 instructions. We use x = 32.
- *
- * http://en.wikipedia.org/wiki/Barrett_reduction
+ * Calculate a crc32c with vpmsum acceleration
  *
  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  *
@@ -23,9 +8,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <asm/ppc_asm.h>
-#include <asm/ppc-opcode.h>
-
 	.section	.rodata
 .balign 16
@@ -33,7 +15,6 @@
 /* byte reverse permute constant */
 .octa 0x0F0E0D0C0B0A09080706050403020100

-#define MAX_SIZE	32768
-
 .constants:
 	/* Reduce 262144 kbits to 1024 bits */
@@ -860,694 +841,6 @@
 	/* 33 bit reflected Barrett constant n */
 	.octa 0x00000000000000000000000105ec76f1

-	.text
-
-#if defined(__BIG_ENDIAN__)
-#define BYTESWAP_DATA
-#else
-#undef BYTESWAP_DATA
-#endif
+#define CRC_FUNCTION_NAME	__crc32c_vpmsum
+#define REFLECT
+#include "crc32-vpmsum_core.S"
#define off16 r25
#define off32 r26
#define off48 r27
#define off64 r28
#define off80 r29
#define off96 r30
#define off112 r31
#define const1 v24
#define const2 v25
#define byteswap v26
#define mask_32bit v27
#define mask_64bit v28
#define zeroes v29
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
FUNC_START(__crc32c_vpmsum)
std r31,-8(r1)
std r30,-16(r1)
std r29,-24(r1)
std r28,-32(r1)
std r27,-40(r1)
std r26,-48(r1)
std r25,-56(r1)
li off16,16
li off32,32
li off48,48
li off64,64
li off80,80
li off96,96
li off112,112
li r0,0
/* Enough room for saving 10 non volatile VMX registers */
subi r6,r1,56+10*16
subi r7,r1,56+2*16
stvx v20,0,r6
stvx v21,off16,r6
stvx v22,off32,r6
stvx v23,off48,r6
stvx v24,off64,r6
stvx v25,off80,r6
stvx v26,off96,r6
stvx v27,off112,r6
stvx v28,0,r7
stvx v29,off16,r7
mr r10,r3
vxor zeroes,zeroes,zeroes
vspltisw v0,-1
vsldoi mask_32bit,zeroes,v0,4
vsldoi mask_64bit,zeroes,v0,8
/* Get the initial value into v8 */
vxor v8,v8,v8
MTVRD(v8, R3)
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
#ifdef BYTESWAP_DATA
addis r3,r2,.byteswap_constant@toc@ha
addi r3,r3,.byteswap_constant@toc@l
lvx byteswap,0,r3
addi r3,r3,16
#endif
cmpdi r5,256
blt .Lshort
rldicr r6,r5,0,56
/* Checksum in blocks of MAX_SIZE */
1: lis r7,MAX_SIZE@h
ori r7,r7,MAX_SIZE@l
mr r9,r7
cmpd r6,r7
bgt 2f
mr r7,r6
2: subf r6,r7,r6
/* our main loop does 128 bytes at a time */
srdi r7,r7,7
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
sldi r8,r7,4
srdi r9,r9,3
subf r8,r8,r9
/* We reduce our final 128 bytes in a separate step */
addi r7,r7,-1
mtctr r7
addis r3,r2,.constants@toc@ha
addi r3,r3,.constants@toc@l
/* Find the start of our constants */
add r3,r3,r8
/* zero v0-v7 which will contain our checksums */
vxor v0,v0,v0
vxor v1,v1,v1
vxor v2,v2,v2
vxor v3,v3,v3
vxor v4,v4,v4
vxor v5,v5,v5
vxor v6,v6,v6
vxor v7,v7,v7
lvx const1,0,r3
/*
* If we are looping back to consume more data we use the values
* already in v16-v23.
*/
cmpdi r0,1
beq 2f
/* First warm up pass */
lvx v16,0,r4
lvx v17,off16,r4
VPERM(v16,v16,v16,byteswap)
VPERM(v17,v17,v17,byteswap)
lvx v18,off32,r4
lvx v19,off48,r4
VPERM(v18,v18,v18,byteswap)
VPERM(v19,v19,v19,byteswap)
lvx v20,off64,r4
lvx v21,off80,r4
VPERM(v20,v20,v20,byteswap)
VPERM(v21,v21,v21,byteswap)
lvx v22,off96,r4
lvx v23,off112,r4
VPERM(v22,v22,v22,byteswap)
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
/* xor in initial value */
vxor v16,v16,v8
2: bdz .Lfirst_warm_up_done
addi r3,r3,16
lvx const2,0,r3
/* Second warm up pass */
VPMSUMD(v8,v16,const1)
lvx v16,0,r4
VPERM(v16,v16,v16,byteswap)
ori r2,r2,0
VPMSUMD(v9,v17,const1)
lvx v17,off16,r4
VPERM(v17,v17,v17,byteswap)
ori r2,r2,0
VPMSUMD(v10,v18,const1)
lvx v18,off32,r4
VPERM(v18,v18,v18,byteswap)
ori r2,r2,0
VPMSUMD(v11,v19,const1)
lvx v19,off48,r4
VPERM(v19,v19,v19,byteswap)
ori r2,r2,0
VPMSUMD(v12,v20,const1)
lvx v20,off64,r4
VPERM(v20,v20,v20,byteswap)
ori r2,r2,0
VPMSUMD(v13,v21,const1)
lvx v21,off80,r4
VPERM(v21,v21,v21,byteswap)
ori r2,r2,0
VPMSUMD(v14,v22,const1)
lvx v22,off96,r4
VPERM(v22,v22,v22,byteswap)
ori r2,r2,0
VPMSUMD(v15,v23,const1)
lvx v23,off112,r4
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
bdz .Lfirst_cool_down
/*
* main loop. We modulo schedule it such that it takes three iterations
* to complete - first iteration load, second iteration vpmsum, third
* iteration xor.
*/
.balign 16
4: lvx const1,0,r3
addi r3,r3,16
ori r2,r2,0
vxor v0,v0,v8
VPMSUMD(v8,v16,const2)
lvx v16,0,r4
VPERM(v16,v16,v16,byteswap)
ori r2,r2,0
vxor v1,v1,v9
VPMSUMD(v9,v17,const2)
lvx v17,off16,r4
VPERM(v17,v17,v17,byteswap)
ori r2,r2,0
vxor v2,v2,v10
VPMSUMD(v10,v18,const2)
lvx v18,off32,r4
VPERM(v18,v18,v18,byteswap)
ori r2,r2,0
vxor v3,v3,v11
VPMSUMD(v11,v19,const2)
lvx v19,off48,r4
VPERM(v19,v19,v19,byteswap)
lvx const2,0,r3
ori r2,r2,0
vxor v4,v4,v12
VPMSUMD(v12,v20,const1)
lvx v20,off64,r4
VPERM(v20,v20,v20,byteswap)
ori r2,r2,0
vxor v5,v5,v13
VPMSUMD(v13,v21,const1)
lvx v21,off80,r4
VPERM(v21,v21,v21,byteswap)
ori r2,r2,0
vxor v6,v6,v14
VPMSUMD(v14,v22,const1)
lvx v22,off96,r4
VPERM(v22,v22,v22,byteswap)
ori r2,r2,0
vxor v7,v7,v15
VPMSUMD(v15,v23,const1)
lvx v23,off112,r4
VPERM(v23,v23,v23,byteswap)
addi r4,r4,8*16
bdnz 4b
.Lfirst_cool_down:
/* First cool down pass */
lvx const1,0,r3
addi r3,r3,16
vxor v0,v0,v8
VPMSUMD(v8,v16,const1)
ori r2,r2,0
vxor v1,v1,v9
VPMSUMD(v9,v17,const1)
ori r2,r2,0
vxor v2,v2,v10
VPMSUMD(v10,v18,const1)
ori r2,r2,0
vxor v3,v3,v11
VPMSUMD(v11,v19,const1)
ori r2,r2,0
vxor v4,v4,v12
VPMSUMD(v12,v20,const1)
ori r2,r2,0
vxor v5,v5,v13
VPMSUMD(v13,v21,const1)
ori r2,r2,0
vxor v6,v6,v14
VPMSUMD(v14,v22,const1)
ori r2,r2,0
vxor v7,v7,v15
VPMSUMD(v15,v23,const1)
ori r2,r2,0
.Lsecond_cool_down:
/* Second cool down pass */
vxor v0,v0,v8
vxor v1,v1,v9
vxor v2,v2,v10
vxor v3,v3,v11
vxor v4,v4,v12
vxor v5,v5,v13
vxor v6,v6,v14
vxor v7,v7,v15
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
vsldoi v0,v0,zeroes,4
vsldoi v1,v1,zeroes,4
vsldoi v2,v2,zeroes,4
vsldoi v3,v3,zeroes,4
vsldoi v4,v4,zeroes,4
vsldoi v5,v5,zeroes,4
vsldoi v6,v6,zeroes,4
vsldoi v7,v7,zeroes,4
/* xor with last 1024 bits */
lvx v8,0,r4
lvx v9,off16,r4
VPERM(v8,v8,v8,byteswap)
VPERM(v9,v9,v9,byteswap)
lvx v10,off32,r4
lvx v11,off48,r4
VPERM(v10,v10,v10,byteswap)
VPERM(v11,v11,v11,byteswap)
lvx v12,off64,r4
lvx v13,off80,r4
VPERM(v12,v12,v12,byteswap)
VPERM(v13,v13,v13,byteswap)
lvx v14,off96,r4
lvx v15,off112,r4
VPERM(v14,v14,v14,byteswap)
VPERM(v15,v15,v15,byteswap)
addi r4,r4,8*16
vxor v16,v0,v8
vxor v17,v1,v9
vxor v18,v2,v10
vxor v19,v3,v11
vxor v20,v4,v12
vxor v21,v5,v13
vxor v22,v6,v14
vxor v23,v7,v15
li r0,1
cmpdi r6,0
addi r6,r6,128
bne 1b
/* Work out how many bytes we have left */
andi. r5,r5,127
/* Calculate where in the constant table we need to start */
subfic r6,r5,128
add r3,r3,r6
/* How many 16 byte chunks are in the tail */
srdi r7,r5,4
mtctr r7
/*
* Reduce the previously calculated 1024 bits to 64 bits, shifting
* 32 bits to include the trailing 32 bits of zeros
*/
lvx v0,0,r3
lvx v1,off16,r3
lvx v2,off32,r3
lvx v3,off48,r3
lvx v4,off64,r3
lvx v5,off80,r3
lvx v6,off96,r3
lvx v7,off112,r3
addi r3,r3,8*16
VPMSUMW(v0,v16,v0)
VPMSUMW(v1,v17,v1)
VPMSUMW(v2,v18,v2)
VPMSUMW(v3,v19,v3)
VPMSUMW(v4,v20,v4)
VPMSUMW(v5,v21,v5)
VPMSUMW(v6,v22,v6)
VPMSUMW(v7,v23,v7)
/* Now reduce the tail (0 - 112 bytes) */
cmpdi r7,0
beq 1f
lvx v16,0,r4
lvx v17,0,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off16,r4
lvx v17,off16,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off32,r4
lvx v17,off32,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off48,r4
lvx v17,off48,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off64,r4
lvx v17,off64,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off80,r4
lvx v17,off80,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
bdz 1f
lvx v16,off96,r4
lvx v17,off96,r3
VPERM(v16,v16,v16,byteswap)
VPMSUMW(v16,v16,v17)
vxor v0,v0,v16
/* Now xor all the parallel chunks together */
1: vxor v0,v0,v1
vxor v2,v2,v3
vxor v4,v4,v5
vxor v6,v6,v7
vxor v0,v0,v2
vxor v4,v4,v6
vxor v0,v0,v4
.Lbarrett_reduction:
/* Barrett constants */
addis r3,r2,.barrett_constants@toc@ha
addi r3,r3,.barrett_constants@toc@l
lvx const1,0,r3
lvx const2,off16,r3
vsldoi v1,v0,v0,8
vxor v0,v0,v1 /* xor two 64 bit results together */
/* shift left one bit */
vspltisb v1,1
vsl v0,v0,v1
vand v0,v0,mask_64bit
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
vand v1,v0,mask_32bit /* bottom 32 bits of a */
VPMSUMD(v1,v1,const1) /* ma */
vand v1,v1,mask_32bit /* bottom 32bits of ma */
VPMSUMD(v1,v1,const2) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
/* Get it into r3 */
MFVRD(R3, v0)
.Lout:
subi r6,r1,56+10*16
subi r7,r1,56+2*16
lvx v20,0,r6
lvx v21,off16,r6
lvx v22,off32,r6
lvx v23,off48,r6
lvx v24,off64,r6
lvx v25,off80,r6
lvx v26,off96,r6
lvx v27,off112,r6
lvx v28,0,r7
lvx v29,off16,r7
ld r31,-8(r1)
ld r30,-16(r1)
ld r29,-24(r1)
ld r28,-32(r1)
ld r27,-40(r1)
ld r26,-48(r1)
ld r25,-56(r1)
blr
.Lfirst_warm_up_done:
lvx const1,0,r3
addi r3,r3,16
VPMSUMD(v8,v16,const1)
VPMSUMD(v9,v17,const1)
VPMSUMD(v10,v18,const1)
VPMSUMD(v11,v19,const1)
VPMSUMD(v12,v20,const1)
VPMSUMD(v13,v21,const1)
VPMSUMD(v14,v22,const1)
VPMSUMD(v15,v23,const1)
b .Lsecond_cool_down
.Lshort:
cmpdi r5,0
beq .Lzero
addis r3,r2,.short_constants@toc@ha
addi r3,r3,.short_constants@toc@l
/* Calculate where in the constant table we need to start */
subfic r6,r5,256
add r3,r3,r6
/* How many 16 byte chunks? */
srdi r7,r5,4
mtctr r7
vxor v19,v19,v19
vxor v20,v20,v20
lvx v0,0,r4
lvx v16,0,r3
VPERM(v0,v0,v16,byteswap)
vxor v0,v0,v8 /* xor in initial value */
VPMSUMW(v0,v0,v16)
bdz .Lv0
lvx v1,off16,r4
lvx v17,off16,r3
VPERM(v1,v1,v17,byteswap)
VPMSUMW(v1,v1,v17)
bdz .Lv1
lvx v2,off32,r4
lvx v16,off32,r3
VPERM(v2,v2,v16,byteswap)
VPMSUMW(v2,v2,v16)
bdz .Lv2
lvx v3,off48,r4
lvx v17,off48,r3
VPERM(v3,v3,v17,byteswap)
VPMSUMW(v3,v3,v17)
bdz .Lv3
lvx v4,off64,r4
lvx v16,off64,r3
VPERM(v4,v4,v16,byteswap)
VPMSUMW(v4,v4,v16)
bdz .Lv4
lvx v5,off80,r4
lvx v17,off80,r3
VPERM(v5,v5,v17,byteswap)
VPMSUMW(v5,v5,v17)
bdz .Lv5
lvx v6,off96,r4
lvx v16,off96,r3
VPERM(v6,v6,v16,byteswap)
VPMSUMW(v6,v6,v16)
bdz .Lv6
lvx v7,off112,r4
lvx v17,off112,r3
VPERM(v7,v7,v17,byteswap)
VPMSUMW(v7,v7,v17)
bdz .Lv7
addi r3,r3,128
addi r4,r4,128
lvx v8,0,r4
lvx v16,0,r3
VPERM(v8,v8,v16,byteswap)
VPMSUMW(v8,v8,v16)
bdz .Lv8
lvx v9,off16,r4
lvx v17,off16,r3
VPERM(v9,v9,v17,byteswap)
VPMSUMW(v9,v9,v17)
bdz .Lv9
lvx v10,off32,r4
lvx v16,off32,r3
VPERM(v10,v10,v16,byteswap)
VPMSUMW(v10,v10,v16)
bdz .Lv10
lvx v11,off48,r4
lvx v17,off48,r3
VPERM(v11,v11,v17,byteswap)
VPMSUMW(v11,v11,v17)
bdz .Lv11
lvx v12,off64,r4
lvx v16,off64,r3
VPERM(v12,v12,v16,byteswap)
VPMSUMW(v12,v12,v16)
bdz .Lv12
lvx v13,off80,r4
lvx v17,off80,r3
VPERM(v13,v13,v17,byteswap)
VPMSUMW(v13,v13,v17)
bdz .Lv13
lvx v14,off96,r4
lvx v16,off96,r3
VPERM(v14,v14,v16,byteswap)
VPMSUMW(v14,v14,v16)
bdz .Lv14
lvx v15,off112,r4
lvx v17,off112,r3
VPERM(v15,v15,v17,byteswap)
VPMSUMW(v15,v15,v17)
.Lv15: vxor v19,v19,v15
.Lv14: vxor v20,v20,v14
.Lv13: vxor v19,v19,v13
.Lv12: vxor v20,v20,v12
.Lv11: vxor v19,v19,v11
.Lv10: vxor v20,v20,v10
.Lv9: vxor v19,v19,v9
.Lv8: vxor v20,v20,v8
.Lv7: vxor v19,v19,v7
.Lv6: vxor v20,v20,v6
.Lv5: vxor v19,v19,v5
.Lv4: vxor v20,v20,v4
.Lv3: vxor v19,v19,v3
.Lv2: vxor v20,v20,v2
.Lv1: vxor v19,v19,v1
.Lv0: vxor v20,v20,v0
vxor v0,v19,v20
b .Lbarrett_reduction
.Lzero:
mr r3,r10
b .Lout
FUNC_END(__crc32c_vpmsum)
(This diff has been collapsed.)
/*
* Calculate a CRC T10-DIF with vpmsum acceleration
*
* Copyright 2017, Daniel Axtens, IBM Corporation.
* [based on crc32c-vpmsum_glue.c]
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*/
#include <linux/crc-t10dif.h>
#include <crypto/internal/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/cpufeature.h>
#include <asm/switch_to.h>
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
#define VECTOR_BREAKPOINT 64
u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
{
unsigned int prealign;
unsigned int tail;
u32 crc = crci;
if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
return crc_t10dif_generic(crc, p, len);
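/* Byte-wise CRC until the buffer is 16-byte aligned for the VMX code. */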
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc_t10dif_generic(crc, p, prealign);
len -= prealign;
p += prealign;
}
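/* The 16-byte-aligned middle goes through the vpmsum asm; it keeps the
 * 16-bit CRC in the upper half of a 32-bit word, hence the shifts. */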
if (len & ~VMX_ALIGN_MASK) {
crc <<= 16;
preempt_disable();
pagefault_disable();
enable_kernel_altivec();
crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
disable_kernel_altivec();
pagefault_enable();
preempt_enable();
crc >>= 16;
}
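/* Finish any remaining unaligned tail with the generic implementation. */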
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc_t10dif_generic(crc, p, tail);
}
return crc & 0xffff;
}
static int crct10dif_vpmsum_init(struct shash_desc *desc)
{
u16 *crc = shash_desc_ctx(desc);
*crc = 0;
return 0;
}
static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
*crc = crct10dif_vpmsum(*crc, data, length);
return 0;
}
static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
{
u16 *crcp = shash_desc_ctx(desc);
*(u16 *)out = *crcp;
return 0;
}
static struct shash_alg alg = {
.init = crct10dif_vpmsum_init,
.update = crct10dif_vpmsum_update,
.final = crct10dif_vpmsum_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.base = {
.cra_name = "crct10dif",
.cra_driver_name = "crct10dif-vpmsum",
.cra_priority = 200,
.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
};
static int __init crct10dif_vpmsum_mod_init(void)
{
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return -ENODEV;
return crypto_register_shash(&alg);
}
static void __exit crct10dif_vpmsum_mod_fini(void)
{
crypto_unregister_shash(&alg);
}
module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
module_exit(crct10dif_vpmsum_mod_fini);
MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
@@ -65,7 +65,6 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>

-#define CONCAT(a,b)	a##b
 #define VMOVDQ		vmovdqu

 #define xdata0		%xmm0
@@ -92,8 +91,6 @@
 #define num_bytes	%r8

 #define tmp		%r10
-#define	DDQ(i)		CONCAT(ddq_add_,i)
-#define	XMM(i)		CONCAT(%xmm, i)
 #define	DDQ_DATA	0
 #define	XDATA		1
 #define KEY_128		1
@@ -131,12 +128,12 @@ ddq_add_8:
 /* generate a unique variable for ddq_add_x */

 .macro setddq n
-	var_ddq_add = DDQ(\n)
+	var_ddq_add = ddq_add_\n
 .endm

 /* generate a unique variable for xmm register */
 .macro setxdata n
-	var_xdata = XMM(\n)
+	var_xdata = %xmm\n
 .endm

 /* club the numeric 'id' to the symbol 'name' */
...
@@ -1522,7 +1522,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -1540,7 +1540,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[2 * 4];
+	le128 buf[2 * 4];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
...
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
@@ -457,7 +458,7 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
 	le128 ivblk = *iv;

 	/* generate next IV */
-	le128_gf128mul_x_ble(iv, &ivblk);
+	gf128mul_x_ble(iv, &ivblk);

 	/* CC <- T xor C */
 	u128_xor(dst, src, (u128 *)&ivblk);
...
@@ -328,7 +328,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
@@ -355,7 +355,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	le128 buf[SERPENT_PARALLEL_BLOCKS];
 	struct crypt_priv crypt_ctx = {
 		.ctx = &ctx->crypt_ctx,
 		.fpu_enabled = false,
...
@@ -296,7 +296,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
@@ -314,7 +314,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[3];
+	le128 buf[3];
 	struct xts_crypt_req req = {
 		.tbuf = buf,
 		.tbuflen = sizeof(buf),
...
@@ -125,16 +125,6 @@ static inline void le128_inc(le128 *i)
 	i->b = cpu_to_le64(b);
 }

-static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
-{
-	u64 a = le64_to_cpu(src->a);
-	u64 b = le64_to_cpu(src->b);
-	u64 _tt = ((s64)a >> 63) & 0x87;
-
-	dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
-	dst->b = cpu_to_le64((b << 1) ^ _tt);
-}
-
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct blkcipher_desc *desc,
 				 struct scatterlist *dst,
...
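For reference, the helper removed above now comes from <crypto/gf128mul.h> as gf128mul_x_ble(); it doubles an element of GF(2^128), computing

    dst = src * x  mod  (x^128 + x^7 + x^2 + x + 1)

with the constant 0x87 encoding the low-order reduction terms x^7 + x^2 + x + 1.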
...@@ -374,7 +374,6 @@ config CRYPTO_XTS ...@@ -374,7 +374,6 @@ config CRYPTO_XTS
tristate "XTS support" tristate "XTS support"
select CRYPTO_BLKCIPHER select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER select CRYPTO_MANAGER
select CRYPTO_GF128MUL
select CRYPTO_ECB select CRYPTO_ECB
help help
XTS: IEEE1619/D16 narrow block cipher use with aes-xts-plain, XTS: IEEE1619/D16 narrow block cipher use with aes-xts-plain,
...@@ -513,6 +512,23 @@ config CRYPTO_CRCT10DIF_PCLMUL ...@@ -513,6 +512,23 @@ config CRYPTO_CRCT10DIF_PCLMUL
'crct10dif-pclmul' module, which is faster when computing the 'crct10dif-pclmul' module, which is faster when computing the
crct10dif checksum as compared with the generic table implementation. crct10dif checksum as compared with the generic table implementation.
config CRYPTO_CRCT10DIF_VPMSUM
tristate "CRCT10DIF powerpc64 hardware acceleration"
depends on PPC64 && ALTIVEC && CRC_T10DIF
select CRYPTO_HASH
help
CRC-T10DIF algorithm implemented using vector polynomial
multiply-sum (vpmsum) instructions, introduced in POWER8. Enable on
POWER8 and newer processors for improved performance.
config CRYPTO_VPMSUM_TESTER
tristate "Powerpc64 vpmsum hardware acceleration tester"
depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
help
Stress test for CRC32c and CRC-T10DIF algorithms implemented with
POWER8 vpmsum instructions.
Unless you are testing these algorithms, you don't need this.
config CRYPTO_GHASH config CRYPTO_GHASH
tristate "GHASH digest algorithm" tristate "GHASH digest algorithm"
select CRYPTO_GF128MUL select CRYPTO_GF128MUL
......
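To exercise the two new entries, a kernel .config would carry something like the fragment below (module builds shown; the symbols come straight from the Kconfig entries above):

# Hypothetical .config fragment enabling the new powerpc64 options
CONFIG_CRYPTO_CRCT10DIF_VPMSUM=m
CONFIG_CRYPTO_VPMSUM_TESTER=m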
...@@ -166,5 +166,34 @@ int crypto_unregister_acomp(struct acomp_alg *alg) ...@@ -166,5 +166,34 @@ int crypto_unregister_acomp(struct acomp_alg *alg)
} }
EXPORT_SYMBOL_GPL(crypto_unregister_acomp); EXPORT_SYMBOL_GPL(crypto_unregister_acomp);
int crypto_register_acomps(struct acomp_alg *algs, int count)
{
int i, ret;
for (i = 0; i < count; i++) {
ret = crypto_register_acomp(&algs[i]);
if (ret)
goto err;
}
return 0;
err:
for (--i; i >= 0; --i)
crypto_unregister_acomp(&algs[i]);
return ret;
}
EXPORT_SYMBOL_GPL(crypto_register_acomps);
void crypto_unregister_acomps(struct acomp_alg *algs, int count)
{
int i;
for (i = count - 1; i >= 0; --i)
crypto_unregister_acomp(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_acomps);
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Asynchronous compression type"); MODULE_DESCRIPTION("Asynchronous compression type");
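Drivers that expose several algorithms can now register them in one call and rely on the unwind-on-failure shown above; crypto_register_scomps()/crypto_unregister_scomps() later in this series follow the same pattern. A hedged sketch of the intended usage (the driver, algorithm names, and array are illustrative, not from this patch, and a real acomp_alg would also fill in its compress/decompress callbacks):

#include <crypto/internal/acompress.h>
#include <linux/module.h>

/* Hypothetical driver exposing two acomp algorithms; only the fields
 * needed to show the batch-registration flow are sketched. */
static struct acomp_alg my_algs[2] = {
	{ .base = { .cra_name = "deflate", .cra_driver_name = "deflate-mydrv",
		    .cra_module = THIS_MODULE } },
	{ .base = { .cra_name = "lz4", .cra_driver_name = "lz4-mydrv",
		    .cra_module = THIS_MODULE } },
};

static int __init my_init(void)
{
	/* Registers every entry; on error each already-registered
	 * algorithm is unregistered before the error is returned. */
	return crypto_register_acomps(my_algs, ARRAY_SIZE(my_algs));
}

static void __exit my_exit(void)
{
	crypto_unregister_acomps(my_algs, ARRAY_SIZE(my_algs));
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");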
...@@ -160,11 +160,11 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -160,11 +160,11 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (sock->state == SS_CONNECTED) if (sock->state == SS_CONNECTED)
return -EINVAL; return -EINVAL;
if (addr_len != sizeof(*sa)) if (addr_len < sizeof(*sa))
return -EINVAL; return -EINVAL;
sa->salg_type[sizeof(sa->salg_type) - 1] = 0; sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
sa->salg_name[sizeof(sa->salg_name) - 1] = 0; sa->salg_name[sizeof(sa->salg_name) + addr_len - sizeof(*sa) - 1] = 0;
type = alg_get_type(sa->salg_type); type = alg_get_type(sa->salg_type);
if (IS_ERR(type) && PTR_ERR(type) == -ENOENT) { if (IS_ERR(type) && PTR_ERR(type) == -ENOENT) {
......
...@@ -963,11 +963,11 @@ void crypto_inc(u8 *a, unsigned int size) ...@@ -963,11 +963,11 @@ void crypto_inc(u8 *a, unsigned int size)
u32 c; u32 c;
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
!((unsigned long)b & (__alignof__(*b) - 1))) IS_ALIGNED((unsigned long)b, __alignof__(*b)))
for (; size >= 4; size -= 4) { for (; size >= 4; size -= 4) {
c = be32_to_cpu(*--b) + 1; c = be32_to_cpu(*--b) + 1;
*b = cpu_to_be32(c); *b = cpu_to_be32(c);
if (c) if (likely(c))
return; return;
} }
......
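crypto_inc() walks the counter as big-endian 32-bit words from the least significant end and stops as soon as a word does not wrap to zero, which the likely() above now tells the compiler is the common case. A standalone userspace sketch of the same carry propagation (a hypothetical helper, with memcpy standing in for the kernel's alignment handling):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>  /* htonl/ntohl */

/* Increment a big-endian counter of 'size' bytes: propagate the carry
 * upward from the least significant 32-bit word, stopping at the
 * first word that does not wrap; any leftover high-order bytes are
 * handled one at a time. */
static void be_counter_inc(uint8_t *a, unsigned int size)
{
	uint32_t w, c;

	while (size >= 4) {
		size -= 4;
		memcpy(&w, a + size, 4);
		c = ntohl(w) + 1;
		w = htonl(c);
		memcpy(a + size, &w, 4);
		if (c)          /* no wrap: carry stops here */
			return;
	}
	while (size--)          /* leftover bytes, least significant first */
		if (++a[size])
			return;
}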
...@@ -45,6 +45,11 @@ struct aead_async_req { ...@@ -45,6 +45,11 @@ struct aead_async_req {
char iv[]; char iv[];
}; };
struct aead_tfm {
struct crypto_aead *aead;
bool has_key;
};
struct aead_ctx { struct aead_ctx {
struct aead_sg_list tsgl; struct aead_sg_list tsgl;
struct aead_async_rsgl first_rsgl; struct aead_async_rsgl first_rsgl;
...@@ -723,24 +728,146 @@ static struct proto_ops algif_aead_ops = { ...@@ -723,24 +728,146 @@ static struct proto_ops algif_aead_ops = {
.poll = aead_poll, .poll = aead_poll,
}; };
static int aead_check_key(struct socket *sock)
{
int err = 0;
struct sock *psk;
struct alg_sock *pask;
struct aead_tfm *tfm;
struct sock *sk = sock->sk;
struct alg_sock *ask = alg_sk(sk);
lock_sock(sk);
if (ask->refcnt)
goto unlock_child;
psk = ask->parent;
pask = alg_sk(ask->parent);
tfm = pask->private;
err = -ENOKEY;
lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
if (!tfm->has_key)
goto unlock;
if (!pask->refcnt++)
sock_hold(psk);
ask->refcnt = 1;
sock_put(psk);
err = 0;
unlock:
release_sock(psk);
unlock_child:
release_sock(sk);
return err;
}
static int aead_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
size_t size)
{
int err;
err = aead_check_key(sock);
if (err)
return err;
return aead_sendmsg(sock, msg, size);
}
static ssize_t aead_sendpage_nokey(struct socket *sock, struct page *page,
int offset, size_t size, int flags)
{
int err;
err = aead_check_key(sock);
if (err)
return err;
return aead_sendpage(sock, page, offset, size, flags);
}
static int aead_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
size_t ignored, int flags)
{
int err;
err = aead_check_key(sock);
if (err)
return err;
return aead_recvmsg(sock, msg, ignored, flags);
}
static struct proto_ops algif_aead_ops_nokey = {
.family = PF_ALG,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.getname = sock_no_getname,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.getsockopt = sock_no_getsockopt,
.mmap = sock_no_mmap,
.bind = sock_no_bind,
.accept = sock_no_accept,
.setsockopt = sock_no_setsockopt,
.release = af_alg_release,
.sendmsg = aead_sendmsg_nokey,
.sendpage = aead_sendpage_nokey,
.recvmsg = aead_recvmsg_nokey,
.poll = aead_poll,
};
static void *aead_bind(const char *name, u32 type, u32 mask) static void *aead_bind(const char *name, u32 type, u32 mask)
{ {
return crypto_alloc_aead(name, type, mask); struct aead_tfm *tfm;
struct crypto_aead *aead;
tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
if (!tfm)
return ERR_PTR(-ENOMEM);
aead = crypto_alloc_aead(name, type, mask);
if (IS_ERR(aead)) {
kfree(tfm);
return ERR_CAST(aead);
}
tfm->aead = aead;
return tfm;
} }
static void aead_release(void *private) static void aead_release(void *private)
{ {
crypto_free_aead(private); struct aead_tfm *tfm = private;
crypto_free_aead(tfm->aead);
kfree(tfm);
} }
static int aead_setauthsize(void *private, unsigned int authsize) static int aead_setauthsize(void *private, unsigned int authsize)
{ {
return crypto_aead_setauthsize(private, authsize); struct aead_tfm *tfm = private;
return crypto_aead_setauthsize(tfm->aead, authsize);
} }
static int aead_setkey(void *private, const u8 *key, unsigned int keylen) static int aead_setkey(void *private, const u8 *key, unsigned int keylen)
{ {
return crypto_aead_setkey(private, key, keylen); struct aead_tfm *tfm = private;
int err;
err = crypto_aead_setkey(tfm->aead, key, keylen);
tfm->has_key = !err;
return err;
} }
static void aead_sock_destruct(struct sock *sk) static void aead_sock_destruct(struct sock *sk)
...@@ -757,12 +884,14 @@ static void aead_sock_destruct(struct sock *sk) ...@@ -757,12 +884,14 @@ static void aead_sock_destruct(struct sock *sk)
af_alg_release_parent(sk); af_alg_release_parent(sk);
} }
static int aead_accept_parent(void *private, struct sock *sk) static int aead_accept_parent_nokey(void *private, struct sock *sk)
{ {
struct aead_ctx *ctx; struct aead_ctx *ctx;
struct alg_sock *ask = alg_sk(sk); struct alg_sock *ask = alg_sk(sk);
unsigned int len = sizeof(*ctx) + crypto_aead_reqsize(private); struct aead_tfm *tfm = private;
unsigned int ivlen = crypto_aead_ivsize(private); struct crypto_aead *aead = tfm->aead;
unsigned int len = sizeof(*ctx) + crypto_aead_reqsize(aead);
unsigned int ivlen = crypto_aead_ivsize(aead);
ctx = sock_kmalloc(sk, len, GFP_KERNEL); ctx = sock_kmalloc(sk, len, GFP_KERNEL);
if (!ctx) if (!ctx)
...@@ -789,7 +918,7 @@ static int aead_accept_parent(void *private, struct sock *sk) ...@@ -789,7 +918,7 @@ static int aead_accept_parent(void *private, struct sock *sk)
ask->private = ctx; ask->private = ctx;
aead_request_set_tfm(&ctx->aead_req, private); aead_request_set_tfm(&ctx->aead_req, aead);
aead_request_set_callback(&ctx->aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, aead_request_set_callback(&ctx->aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
af_alg_complete, &ctx->completion); af_alg_complete, &ctx->completion);
...@@ -798,13 +927,25 @@ static int aead_accept_parent(void *private, struct sock *sk) ...@@ -798,13 +927,25 @@ static int aead_accept_parent(void *private, struct sock *sk)
return 0; return 0;
} }
static int aead_accept_parent(void *private, struct sock *sk)
{
struct aead_tfm *tfm = private;
if (!tfm->has_key)
return -ENOKEY;
return aead_accept_parent_nokey(private, sk);
}
static const struct af_alg_type algif_type_aead = { static const struct af_alg_type algif_type_aead = {
.bind = aead_bind, .bind = aead_bind,
.release = aead_release, .release = aead_release,
.setkey = aead_setkey, .setkey = aead_setkey,
.setauthsize = aead_setauthsize, .setauthsize = aead_setauthsize,
.accept = aead_accept_parent, .accept = aead_accept_parent,
.accept_nokey = aead_accept_parent_nokey,
.ops = &algif_aead_ops, .ops = &algif_aead_ops,
.ops_nokey = &algif_aead_ops_nokey,
.name = "aead", .name = "aead",
.owner = THIS_MODULE .owner = THIS_MODULE
}; };
......
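From userspace, the change above enforces an ordering on the AF_ALG socket calls: the key must be set on the bound tfm socket before accept(2), which otherwise fails with ENOKEY. A minimal sketch of the now-required sequence (algorithm name and key length are illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "aead",
		.salg_name   = "gcm(aes)",
	};
	unsigned char key[16] = { 0 };   /* placeholder key */
	int tfmfd, opfd;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("bind");
		return 1;
	}

	/* accept(2) before this setsockopt now fails with ENOKEY */
	if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)) < 0) {
		perror("ALG_SET_KEY");
		return 1;
	}

	opfd = accept(tfmfd, NULL, NULL);   /* succeeds only after setkey */
	if (opfd < 0) {
		perror("accept");
		return 1;
	}

	close(opfd);
	close(tfmfd);
	return 0;
}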
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
* *
*/ */
#include <crypto/algapi.h>
#include <crypto/cbc.h> #include <crypto/cbc.h>
#include <crypto/internal/skcipher.h> #include <crypto/internal/skcipher.h>
#include <linux/err.h> #include <linux/err.h>
...@@ -108,8 +109,10 @@ static void crypto_cbc_free(struct skcipher_instance *inst) ...@@ -108,8 +109,10 @@ static void crypto_cbc_free(struct skcipher_instance *inst)
static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb) static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb)
{ {
struct skcipher_instance *inst; struct skcipher_instance *inst;
struct crypto_attr_type *algt;
struct crypto_spawn *spawn; struct crypto_spawn *spawn;
struct crypto_alg *alg; struct crypto_alg *alg;
u32 mask;
int err; int err;
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER); err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER);
...@@ -120,8 +123,16 @@ static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb) ...@@ -120,8 +123,16 @@ static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb)
if (!inst) if (!inst)
return -ENOMEM; return -ENOMEM;
alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER, algt = crypto_get_attr_type(tb);
CRYPTO_ALG_TYPE_MASK); err = PTR_ERR(algt);
if (IS_ERR(algt))
goto err_free_inst;
mask = CRYPTO_ALG_TYPE_MASK |
crypto_requires_off(algt->type, algt->mask,
CRYPTO_ALG_NEED_FALLBACK);
alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER, mask);
err = PTR_ERR(alg); err = PTR_ERR(alg);
if (IS_ERR(alg)) if (IS_ERR(alg))
goto err_free_inst; goto err_free_inst;
......
...@@ -83,7 +83,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -83,7 +83,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
{ {
struct crypto_report_cipher rcipher; struct crypto_report_cipher rcipher;
strncpy(rcipher.type, "cipher", sizeof(rcipher.type)); strlcpy(rcipher.type, "cipher", sizeof(rcipher.type));
rcipher.blocksize = alg->cra_blocksize; rcipher.blocksize = alg->cra_blocksize;
rcipher.min_keysize = alg->cra_cipher.cia_min_keysize; rcipher.min_keysize = alg->cra_cipher.cia_min_keysize;
...@@ -102,7 +102,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -102,7 +102,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
{ {
struct crypto_report_comp rcomp; struct crypto_report_comp rcomp;
strncpy(rcomp.type, "compression", sizeof(rcomp.type)); strlcpy(rcomp.type, "compression", sizeof(rcomp.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
sizeof(struct crypto_report_comp), &rcomp)) sizeof(struct crypto_report_comp), &rcomp))
goto nla_put_failure; goto nla_put_failure;
...@@ -116,7 +116,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -116,7 +116,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
{ {
struct crypto_report_acomp racomp; struct crypto_report_acomp racomp;
strncpy(racomp.type, "acomp", sizeof(racomp.type)); strlcpy(racomp.type, "acomp", sizeof(racomp.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP,
sizeof(struct crypto_report_acomp), &racomp)) sizeof(struct crypto_report_acomp), &racomp))
...@@ -131,7 +131,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -131,7 +131,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
{ {
struct crypto_report_akcipher rakcipher; struct crypto_report_akcipher rakcipher;
strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type)); strlcpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
sizeof(struct crypto_report_akcipher), &rakcipher)) sizeof(struct crypto_report_akcipher), &rakcipher))
...@@ -146,7 +146,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -146,7 +146,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
{ {
struct crypto_report_kpp rkpp; struct crypto_report_kpp rkpp;
strncpy(rkpp.type, "kpp", sizeof(rkpp.type)); strlcpy(rkpp.type, "kpp", sizeof(rkpp.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_KPP, if (nla_put(skb, CRYPTOCFGA_REPORT_KPP,
sizeof(struct crypto_report_kpp), &rkpp)) sizeof(struct crypto_report_kpp), &rkpp))
...@@ -160,10 +160,10 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg) ...@@ -160,10 +160,10 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
static int crypto_report_one(struct crypto_alg *alg, static int crypto_report_one(struct crypto_alg *alg,
struct crypto_user_alg *ualg, struct sk_buff *skb) struct crypto_user_alg *ualg, struct sk_buff *skb)
{ {
strncpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
strncpy(ualg->cru_driver_name, alg->cra_driver_name, strlcpy(ualg->cru_driver_name, alg->cra_driver_name,
sizeof(ualg->cru_driver_name)); sizeof(ualg->cru_driver_name));
strncpy(ualg->cru_module_name, module_name(alg->cra_module), strlcpy(ualg->cru_module_name, module_name(alg->cra_module),
sizeof(ualg->cru_module_name)); sizeof(ualg->cru_module_name));
ualg->cru_type = 0; ualg->cru_type = 0;
...@@ -176,7 +176,7 @@ static int crypto_report_one(struct crypto_alg *alg, ...@@ -176,7 +176,7 @@ static int crypto_report_one(struct crypto_alg *alg,
if (alg->cra_flags & CRYPTO_ALG_LARVAL) { if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
struct crypto_report_larval rl; struct crypto_report_larval rl;
strncpy(rl.type, "larval", sizeof(rl.type)); strlcpy(rl.type, "larval", sizeof(rl.type));
if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL,
sizeof(struct crypto_report_larval), &rl)) sizeof(struct crypto_report_larval), &rl))
goto nla_put_failure; goto nla_put_failure;
......
...@@ -181,15 +181,24 @@ static void crypto_ctr_exit_tfm(struct crypto_tfm *tfm) ...@@ -181,15 +181,24 @@ static void crypto_ctr_exit_tfm(struct crypto_tfm *tfm)
static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb) static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
{ {
struct crypto_instance *inst; struct crypto_instance *inst;
struct crypto_attr_type *algt;
struct crypto_alg *alg; struct crypto_alg *alg;
u32 mask;
int err; int err;
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
if (err) if (err)
return ERR_PTR(err); return ERR_PTR(err);
alg = crypto_attr_alg(tb[1], CRYPTO_ALG_TYPE_CIPHER, algt = crypto_get_attr_type(tb);
CRYPTO_ALG_TYPE_MASK); if (IS_ERR(algt))
return ERR_CAST(algt);
mask = CRYPTO_ALG_TYPE_MASK |
crypto_requires_off(algt->type, algt->mask,
CRYPTO_ALG_NEED_FALLBACK);
alg = crypto_attr_alg(tb[1], CRYPTO_ALG_TYPE_CIPHER, mask);
if (IS_ERR(alg)) if (IS_ERR(alg))
return ERR_CAST(alg); return ERR_CAST(alg);
...@@ -350,6 +359,8 @@ static int crypto_rfc3686_create(struct crypto_template *tmpl, ...@@ -350,6 +359,8 @@ static int crypto_rfc3686_create(struct crypto_template *tmpl,
struct skcipher_alg *alg; struct skcipher_alg *alg;
struct crypto_skcipher_spawn *spawn; struct crypto_skcipher_spawn *spawn;
const char *cipher_name; const char *cipher_name;
u32 mask;
int err; int err;
algt = crypto_get_attr_type(tb); algt = crypto_get_attr_type(tb);
...@@ -367,12 +378,14 @@ static int crypto_rfc3686_create(struct crypto_template *tmpl, ...@@ -367,12 +378,14 @@ static int crypto_rfc3686_create(struct crypto_template *tmpl,
if (!inst) if (!inst)
return -ENOMEM; return -ENOMEM;
mask = crypto_requires_sync(algt->type, algt->mask) |
crypto_requires_off(algt->type, algt->mask,
CRYPTO_ALG_NEED_FALLBACK);
spawn = skcipher_instance_ctx(inst); spawn = skcipher_instance_ctx(inst);
crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst)); crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst));
err = crypto_grab_skcipher(spawn, cipher_name, 0, err = crypto_grab_skcipher(spawn, cipher_name, 0, mask);
crypto_requires_sync(algt->type,
algt->mask));
if (err) if (err)
goto err_free_inst; goto err_free_inst;
......
...@@ -43,20 +43,24 @@ struct deflate_ctx { ...@@ -43,20 +43,24 @@ struct deflate_ctx {
struct z_stream_s decomp_stream; struct z_stream_s decomp_stream;
}; };
static int deflate_comp_init(struct deflate_ctx *ctx) static int deflate_comp_init(struct deflate_ctx *ctx, int format)
{ {
int ret = 0; int ret = 0;
struct z_stream_s *stream = &ctx->comp_stream; struct z_stream_s *stream = &ctx->comp_stream;
stream->workspace = vzalloc(zlib_deflate_workspacesize( stream->workspace = vzalloc(zlib_deflate_workspacesize(
-DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL)); MAX_WBITS, MAX_MEM_LEVEL));
if (!stream->workspace) { if (!stream->workspace) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
ret = zlib_deflateInit2(stream, DEFLATE_DEF_LEVEL, Z_DEFLATED, if (format)
-DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, ret = zlib_deflateInit(stream, 3);
Z_DEFAULT_STRATEGY); else
ret = zlib_deflateInit2(stream, DEFLATE_DEF_LEVEL, Z_DEFLATED,
-DEFLATE_DEF_WINBITS,
DEFLATE_DEF_MEMLEVEL,
Z_DEFAULT_STRATEGY);
if (ret != Z_OK) { if (ret != Z_OK) {
ret = -EINVAL; ret = -EINVAL;
goto out_free; goto out_free;
...@@ -68,7 +72,7 @@ static int deflate_comp_init(struct deflate_ctx *ctx) ...@@ -68,7 +72,7 @@ static int deflate_comp_init(struct deflate_ctx *ctx)
goto out; goto out;
} }
static int deflate_decomp_init(struct deflate_ctx *ctx) static int deflate_decomp_init(struct deflate_ctx *ctx, int format)
{ {
int ret = 0; int ret = 0;
struct z_stream_s *stream = &ctx->decomp_stream; struct z_stream_s *stream = &ctx->decomp_stream;
...@@ -78,7 +82,10 @@ static int deflate_decomp_init(struct deflate_ctx *ctx) ...@@ -78,7 +82,10 @@ static int deflate_decomp_init(struct deflate_ctx *ctx)
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
ret = zlib_inflateInit2(stream, -DEFLATE_DEF_WINBITS); if (format)
ret = zlib_inflateInit(stream);
else
ret = zlib_inflateInit2(stream, -DEFLATE_DEF_WINBITS);
if (ret != Z_OK) { if (ret != Z_OK) {
ret = -EINVAL; ret = -EINVAL;
goto out_free; goto out_free;
...@@ -102,21 +109,21 @@ static void deflate_decomp_exit(struct deflate_ctx *ctx) ...@@ -102,21 +109,21 @@ static void deflate_decomp_exit(struct deflate_ctx *ctx)
vfree(ctx->decomp_stream.workspace); vfree(ctx->decomp_stream.workspace);
} }
static int __deflate_init(void *ctx) static int __deflate_init(void *ctx, int format)
{ {
int ret; int ret;
ret = deflate_comp_init(ctx); ret = deflate_comp_init(ctx, format);
if (ret) if (ret)
goto out; goto out;
ret = deflate_decomp_init(ctx); ret = deflate_decomp_init(ctx, format);
if (ret) if (ret)
deflate_comp_exit(ctx); deflate_comp_exit(ctx);
out: out:
return ret; return ret;
} }
static void *deflate_alloc_ctx(struct crypto_scomp *tfm) static void *gen_deflate_alloc_ctx(struct crypto_scomp *tfm, int format)
{ {
struct deflate_ctx *ctx; struct deflate_ctx *ctx;
int ret; int ret;
...@@ -125,7 +132,7 @@ static void *deflate_alloc_ctx(struct crypto_scomp *tfm) ...@@ -125,7 +132,7 @@ static void *deflate_alloc_ctx(struct crypto_scomp *tfm)
if (!ctx) if (!ctx)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
ret = __deflate_init(ctx); ret = __deflate_init(ctx, format);
if (ret) { if (ret) {
kfree(ctx); kfree(ctx);
return ERR_PTR(ret); return ERR_PTR(ret);
...@@ -134,11 +141,21 @@ static void *deflate_alloc_ctx(struct crypto_scomp *tfm) ...@@ -134,11 +141,21 @@ static void *deflate_alloc_ctx(struct crypto_scomp *tfm)
return ctx; return ctx;
} }
static void *deflate_alloc_ctx(struct crypto_scomp *tfm)
{
return gen_deflate_alloc_ctx(tfm, 0);
}
static void *zlib_deflate_alloc_ctx(struct crypto_scomp *tfm)
{
return gen_deflate_alloc_ctx(tfm, 1);
}
static int deflate_init(struct crypto_tfm *tfm) static int deflate_init(struct crypto_tfm *tfm)
{ {
struct deflate_ctx *ctx = crypto_tfm_ctx(tfm); struct deflate_ctx *ctx = crypto_tfm_ctx(tfm);
return __deflate_init(ctx); return __deflate_init(ctx, 0);
} }
static void __deflate_exit(void *ctx) static void __deflate_exit(void *ctx)
...@@ -272,7 +289,7 @@ static struct crypto_alg alg = { ...@@ -272,7 +289,7 @@ static struct crypto_alg alg = {
.coa_decompress = deflate_decompress } } .coa_decompress = deflate_decompress } }
}; };
static struct scomp_alg scomp = { static struct scomp_alg scomp[] = { {
.alloc_ctx = deflate_alloc_ctx, .alloc_ctx = deflate_alloc_ctx,
.free_ctx = deflate_free_ctx, .free_ctx = deflate_free_ctx,
.compress = deflate_scompress, .compress = deflate_scompress,
...@@ -282,7 +299,17 @@ static struct scomp_alg scomp = { ...@@ -282,7 +299,17 @@ static struct scomp_alg scomp = {
.cra_driver_name = "deflate-scomp", .cra_driver_name = "deflate-scomp",
.cra_module = THIS_MODULE, .cra_module = THIS_MODULE,
} }
}; }, {
.alloc_ctx = zlib_deflate_alloc_ctx,
.free_ctx = deflate_free_ctx,
.compress = deflate_scompress,
.decompress = deflate_sdecompress,
.base = {
.cra_name = "zlib-deflate",
.cra_driver_name = "zlib-deflate-scomp",
.cra_module = THIS_MODULE,
}
} };
static int __init deflate_mod_init(void) static int __init deflate_mod_init(void)
{ {
...@@ -292,7 +319,7 @@ static int __init deflate_mod_init(void) ...@@ -292,7 +319,7 @@ static int __init deflate_mod_init(void)
if (ret) if (ret)
return ret; return ret;
ret = crypto_register_scomp(&scomp); ret = crypto_register_scomps(scomp, ARRAY_SIZE(scomp));
if (ret) { if (ret) {
crypto_unregister_alg(&alg); crypto_unregister_alg(&alg);
return ret; return ret;
...@@ -304,7 +331,7 @@ static int __init deflate_mod_init(void) ...@@ -304,7 +331,7 @@ static int __init deflate_mod_init(void)
static void __exit deflate_mod_fini(void) static void __exit deflate_mod_fini(void)
{ {
crypto_unregister_alg(&alg); crypto_unregister_alg(&alg);
crypto_unregister_scomp(&scomp); crypto_unregister_scomps(scomp, ARRAY_SIZE(scomp));
} }
module_init(deflate_mod_init); module_init(deflate_mod_init);
......
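The format flag above selects between the two stream framings: zlib_deflateInit() (the kernel picks compression level 3 here) produces an rfc1950 zlib header and Adler-32 trailer, while zlib_deflateInit2() with negative windowBits produces a raw rfc1951 deflate stream. The same distinction in plain userspace zlib, as an illustration (not the kernel wrappers):

#include <zlib.h>
#include <stdio.h>

int main(void)
{
	z_stream raw = { 0 }, wrapped = { 0 };

	/* raw deflate: negative windowBits (-MAX_WBITS), no zlib wrapper,
	 * i.e. what the "deflate" algorithm provides */
	if (deflateInit2(&raw, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
			 -MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK)
		return 1;

	/* zlib format: plain deflateInit() implies windowBits = MAX_WBITS,
	 * i.e. what the new "zlib-deflate" algorithm provides */
	if (deflateInit(&wrapped, Z_DEFAULT_COMPRESSION) != Z_OK)
		return 1;

	deflateEnd(&raw);
	deflateEnd(&wrapped);
	puts("raw and zlib-wrapped streams initialised");
	return 0;
}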
...@@ -79,7 +79,8 @@ static int dh_set_params(struct dh_ctx *ctx, struct dh *params) ...@@ -79,7 +79,8 @@ static int dh_set_params(struct dh_ctx *ctx, struct dh *params)
return 0; return 0;
} }
static int dh_set_secret(struct crypto_kpp *tfm, void *buf, unsigned int len) static int dh_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{ {
struct dh_ctx *ctx = dh_get_ctx(tfm); struct dh_ctx *ctx = dh_get_ctx(tfm);
struct dh params; struct dh params;
......
...@@ -1749,17 +1749,16 @@ static int drbg_kcapi_sym_ctr(struct drbg_state *drbg, ...@@ -1749,17 +1749,16 @@ static int drbg_kcapi_sym_ctr(struct drbg_state *drbg,
u8 *inbuf, u32 inlen, u8 *inbuf, u32 inlen,
u8 *outbuf, u32 outlen) u8 *outbuf, u32 outlen)
{ {
struct scatterlist sg_in; struct scatterlist sg_in, sg_out;
int ret; int ret;
sg_init_one(&sg_in, inbuf, inlen); sg_init_one(&sg_in, inbuf, inlen);
sg_init_one(&sg_out, drbg->outscratchpad, DRBG_OUTSCRATCHLEN);
while (outlen) { while (outlen) {
u32 cryptlen = min3(inlen, outlen, (u32)DRBG_OUTSCRATCHLEN); u32 cryptlen = min3(inlen, outlen, (u32)DRBG_OUTSCRATCHLEN);
struct scatterlist sg_out;
/* Output buffer may not be valid for SGL, use scratchpad */ /* Output buffer may not be valid for SGL, use scratchpad */
sg_init_one(&sg_out, drbg->outscratchpad, cryptlen);
skcipher_request_set_crypt(drbg->ctr_req, &sg_in, &sg_out, skcipher_request_set_crypt(drbg->ctr_req, &sg_in, &sg_out,
cryptlen, drbg->V); cryptlen, drbg->V);
ret = crypto_skcipher_encrypt(drbg->ctr_req); ret = crypto_skcipher_encrypt(drbg->ctr_req);
......
...@@ -38,7 +38,8 @@ static unsigned int ecdh_supported_curve(unsigned int curve_id) ...@@ -38,7 +38,8 @@ static unsigned int ecdh_supported_curve(unsigned int curve_id)
} }
} }
static int ecdh_set_secret(struct crypto_kpp *tfm, void *buf, unsigned int len) static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{ {
struct ecdh_ctx *ctx = ecdh_get_ctx(tfm); struct ecdh_ctx *ctx = ecdh_get_ctx(tfm);
struct ecdh params; struct ecdh params;
......
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
--------------------------------------------------------------------------- ---------------------------------------------------------------------------
Issue 31/01/2006 Issue 31/01/2006
This file provides fast multiplication in GF(128) as required by several This file provides fast multiplication in GF(2^128) as required by several
cryptographic authentication modes cryptographic authentication modes
*/ */
...@@ -88,76 +88,59 @@ ...@@ -88,76 +88,59 @@
q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
} }
/* Given the value i in 0..255 as the byte overflow when a field element /*
in GHASH is multiplied by x^8, this function will return the values that * Given a value i in 0..255 as the byte overflow when a field element
are generated in the lo 16-bit word of the field value by applying the * in GF(2^128) is multiplied by x^8, the following macro returns the
modular polynomial. The values lo_byte and hi_byte are returned via the * 16-bit value that must be XOR-ed into the low-degree end of the
macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
memory as required by a suitable definition of this macro operating on *
the table above * There are two versions of the macro, and hence two tables: one for
*/ * the "be" convention where the highest-order bit is the coefficient of
* the highest-degree polynomial term, and one for the "le" convention
#define xx(p, q) 0x##p##q * where the highest-order bit is the coefficient of the lowest-degree
* polynomial term. In both cases the values are stored in CPU byte
* endianness such that the coefficients are ordered consistently across
* bytes, i.e. in the "be" table bits 15..0 of the stored value
* correspond to the coefficients of x^15..x^0, and in the "le" table
* bits 15..0 correspond to the coefficients of x^0..x^15.
*
* Therefore, provided that the appropriate byte endianness conversions
* are done by the multiplication functions (and these must be in place
* anyway to support both little endian and big endian CPUs), the "be"
* table can be used for multiplications of both "bbe" and "ble"
* elements, and the "le" table can be used for multiplications of both
* "lle" and "lbe" elements.
*/
#define xda_bbe(i) ( \ #define xda_be(i) ( \
(i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \ (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
(i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \ (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
(i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \ (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
(i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \ (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
) )
#define xda_lle(i) ( \ #define xda_le(i) ( \
(i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \ (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
(i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \ (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
(i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \ (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
(i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \ (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
) )
static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
/* These functions multiply a field element by x, by x^4 and by x^8 /*
* in the polynomial field representation. It uses 32-bit word operations * The following functions multiply a field element by x^8 in
* to gain speed but compensates for machine endianess and hence works * the polynomial field representation. They use 64-bit word operations
* to gain speed but compensate for machine endianness and hence work
* correctly on both styles of machine. * correctly on both styles of machine.
*/ */
static void gf128mul_x_lle(be128 *r, const be128 *x)
{
u64 a = be64_to_cpu(x->a);
u64 b = be64_to_cpu(x->b);
u64 _tt = gf128mul_table_lle[(b << 7) & 0xff];
r->b = cpu_to_be64((b >> 1) | (a << 63));
r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
}
static void gf128mul_x_bbe(be128 *r, const be128 *x)
{
u64 a = be64_to_cpu(x->a);
u64 b = be64_to_cpu(x->b);
u64 _tt = gf128mul_table_bbe[a >> 63];
r->a = cpu_to_be64((a << 1) | (b >> 63));
r->b = cpu_to_be64((b << 1) ^ _tt);
}
void gf128mul_x_ble(be128 *r, const be128 *x)
{
u64 a = le64_to_cpu(x->a);
u64 b = le64_to_cpu(x->b);
u64 _tt = gf128mul_table_bbe[b >> 63];
r->a = cpu_to_le64((a << 1) ^ _tt);
r->b = cpu_to_le64((b << 1) | (a >> 63));
}
EXPORT_SYMBOL(gf128mul_x_ble);
static void gf128mul_x8_lle(be128 *x) static void gf128mul_x8_lle(be128 *x)
{ {
u64 a = be64_to_cpu(x->a); u64 a = be64_to_cpu(x->a);
u64 b = be64_to_cpu(x->b); u64 b = be64_to_cpu(x->b);
u64 _tt = gf128mul_table_lle[b & 0xff]; u64 _tt = gf128mul_table_le[b & 0xff];
x->b = cpu_to_be64((b >> 8) | (a << 56)); x->b = cpu_to_be64((b >> 8) | (a << 56));
x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
...@@ -167,7 +150,7 @@ static void gf128mul_x8_bbe(be128 *x) ...@@ -167,7 +150,7 @@ static void gf128mul_x8_bbe(be128 *x)
{ {
u64 a = be64_to_cpu(x->a); u64 a = be64_to_cpu(x->a);
u64 b = be64_to_cpu(x->b); u64 b = be64_to_cpu(x->b);
u64 _tt = gf128mul_table_bbe[a >> 56]; u64 _tt = gf128mul_table_be[a >> 56];
x->a = cpu_to_be64((a << 8) | (b >> 56)); x->a = cpu_to_be64((a << 8) | (b >> 56));
x->b = cpu_to_be64((b << 8) ^ _tt); x->b = cpu_to_be64((b << 8) ^ _tt);
...@@ -251,7 +234,7 @@ EXPORT_SYMBOL(gf128mul_bbe); ...@@ -251,7 +234,7 @@ EXPORT_SYMBOL(gf128mul_bbe);
/* This version uses 64k bytes of table space. /* This version uses 64k bytes of table space.
A 16 byte buffer has to be multiplied by a 16 byte key A 16 byte buffer has to be multiplied by a 16 byte key
value in GF(128). If we consider a GF(128) value in value in GF(2^128). If we consider a GF(2^128) value in
the buffer's lowest byte, we can construct a table of the buffer's lowest byte, we can construct a table of
the 256 16 byte values that result from the 256 values the 256 16 byte values that result from the 256 values
of this byte. This requires 4096 bytes. But we also of this byte. This requires 4096 bytes. But we also
...@@ -315,7 +298,7 @@ void gf128mul_free_64k(struct gf128mul_64k *t) ...@@ -315,7 +298,7 @@ void gf128mul_free_64k(struct gf128mul_64k *t)
} }
EXPORT_SYMBOL(gf128mul_free_64k); EXPORT_SYMBOL(gf128mul_free_64k);
void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t) void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
{ {
u8 *ap = (u8 *)a; u8 *ap = (u8 *)a;
be128 r[1]; be128 r[1];
...@@ -330,7 +313,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe); ...@@ -330,7 +313,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe);
/* This version uses 4k bytes of table space. /* This version uses 4k bytes of table space.
A 16 byte buffer has to be multiplied by a 16 byte key A 16 byte buffer has to be multiplied by a 16 byte key
value in GF(128). If we consider a GF(128) value in a value in GF(2^128). If we consider a GF(2^128) value in a
single byte, we can construct a table of the 256 16 byte single byte, we can construct a table of the 256 16 byte
values that result from the 256 values of this byte. values that result from the 256 values of this byte.
This requires 4096 bytes. If we take the highest byte in This requires 4096 bytes. If we take the highest byte in
...@@ -388,7 +371,7 @@ struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g) ...@@ -388,7 +371,7 @@ struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
} }
EXPORT_SYMBOL(gf128mul_init_4k_bbe); EXPORT_SYMBOL(gf128mul_init_4k_bbe);
void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{ {
u8 *ap = (u8 *)a; u8 *ap = (u8 *)a;
be128 r[1]; be128 r[1];
...@@ -403,7 +386,7 @@ void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) ...@@ -403,7 +386,7 @@ void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
} }
EXPORT_SYMBOL(gf128mul_4k_lle); EXPORT_SYMBOL(gf128mul_4k_lle);
void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t) void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
{ {
u8 *ap = (u8 *)a; u8 *ap = (u8 *)a;
be128 r[1]; be128 r[1];
......
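The renamed tables encode the reduction the rewritten comment describes: bit i of the overflow byte stands for x^(128+i), and x^128 is congruent to x^7 + x^2 + x + 1 (0x87) modulo the GHASH polynomial, so each "be" entry is just 0x87 shifted left once per set bit. A standalone spot-check of the constants quoted in the xda_be() macro, independent of the kernel tables:

#include <stdint.h>
#include <stdio.h>

static uint16_t xda_be_ref(uint8_t i)
{
	uint16_t r = 0;

	for (int bit = 0; bit < 8; bit++)
		if (i & (1u << bit))
			r ^= (uint16_t)(0x87u << bit);   /* x^bit * 0x87 */
	return r;
}

int main(void)
{
	printf("xda_be(0x01) = 0x%04x (expect 0x0087)\n", xda_be_ref(0x01));
	printf("xda_be(0x02) = 0x%04x (expect 0x010e)\n", xda_be_ref(0x02));
	printf("xda_be(0x80) = 0x%04x (expect 0x4380)\n", xda_be_ref(0x80));
	return 0;
}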
...@@ -97,7 +97,7 @@ static int __lz4_decompress_crypto(const u8 *src, unsigned int slen, ...@@ -97,7 +97,7 @@ static int __lz4_decompress_crypto(const u8 *src, unsigned int slen,
int out_len = LZ4_decompress_safe(src, dst, slen, *dlen); int out_len = LZ4_decompress_safe(src, dst, slen, *dlen);
if (out_len < 0) if (out_len < 0)
return out_len; return -EINVAL;
*dlen = out_len; *dlen = out_len;
return 0; return 0;
......
...@@ -98,7 +98,7 @@ static int __lz4hc_decompress_crypto(const u8 *src, unsigned int slen, ...@@ -98,7 +98,7 @@ static int __lz4hc_decompress_crypto(const u8 *src, unsigned int slen,
int out_len = LZ4_decompress_safe(src, dst, slen, *dlen); int out_len = LZ4_decompress_safe(src, dst, slen, *dlen);
if (out_len < 0) if (out_len < 0)
return out_len; return -EINVAL;
*dlen = out_len; *dlen = out_len;
return 0; return 0;
......
...@@ -21,9 +21,11 @@ ...@@ -21,9 +21,11 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/cryptohash.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
#define MD5_DIGEST_WORDS 4
#define MD5_MESSAGE_BYTES 64
const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = {
0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
...@@ -47,6 +49,97 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) ...@@ -47,6 +49,97 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
} }
} }
#define F1(x, y, z) (z ^ (x & (y ^ z)))
#define F2(x, y, z) F1(z, x, y)
#define F3(x, y, z) (x ^ y ^ z)
#define F4(x, y, z) (y ^ (x | ~z))
#define MD5STEP(f, w, x, y, z, in, s) \
(w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x)
static void md5_transform(__u32 *hash, __u32 const *in)
{
u32 a, b, c, d;
a = hash[0];
b = hash[1];
c = hash[2];
d = hash[3];
MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
hash[0] += a;
hash[1] += b;
hash[2] += c;
hash[3] += d;
}
static inline void md5_transform_helper(struct md5_state *ctx) static inline void md5_transform_helper(struct md5_state *ctx)
{ {
le32_to_cpu_array(ctx->block, sizeof(ctx->block) / sizeof(u32)); le32_to_cpu_array(ctx->block, sizeof(ctx->block) / sizeof(u32));
......
...@@ -353,5 +353,34 @@ int crypto_unregister_scomp(struct scomp_alg *alg) ...@@ -353,5 +353,34 @@ int crypto_unregister_scomp(struct scomp_alg *alg)
} }
EXPORT_SYMBOL_GPL(crypto_unregister_scomp); EXPORT_SYMBOL_GPL(crypto_unregister_scomp);
int crypto_register_scomps(struct scomp_alg *algs, int count)
{
int i, ret;
for (i = 0; i < count; i++) {
ret = crypto_register_scomp(&algs[i]);
if (ret)
goto err;
}
return 0;
err:
for (--i; i >= 0; --i)
crypto_unregister_scomp(&algs[i]);
return ret;
}
EXPORT_SYMBOL_GPL(crypto_register_scomps);
void crypto_unregister_scomps(struct scomp_alg *algs, int count)
{
int i;
for (i = count - 1; i >= 0; --i)
crypto_unregister_scomp(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_scomps);
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous compression type"); MODULE_DESCRIPTION("Synchronous compression type");
...@@ -83,47 +83,47 @@ struct tcrypt_result { ...@@ -83,47 +83,47 @@ struct tcrypt_result {
struct aead_test_suite { struct aead_test_suite {
struct { struct {
struct aead_testvec *vecs; const struct aead_testvec *vecs;
unsigned int count; unsigned int count;
} enc, dec; } enc, dec;
}; };
struct cipher_test_suite { struct cipher_test_suite {
struct { struct {
struct cipher_testvec *vecs; const struct cipher_testvec *vecs;
unsigned int count; unsigned int count;
} enc, dec; } enc, dec;
}; };
struct comp_test_suite { struct comp_test_suite {
struct { struct {
struct comp_testvec *vecs; const struct comp_testvec *vecs;
unsigned int count; unsigned int count;
} comp, decomp; } comp, decomp;
}; };
struct hash_test_suite { struct hash_test_suite {
struct hash_testvec *vecs; const struct hash_testvec *vecs;
unsigned int count; unsigned int count;
}; };
struct cprng_test_suite { struct cprng_test_suite {
struct cprng_testvec *vecs; const struct cprng_testvec *vecs;
unsigned int count; unsigned int count;
}; };
struct drbg_test_suite { struct drbg_test_suite {
struct drbg_testvec *vecs; const struct drbg_testvec *vecs;
unsigned int count; unsigned int count;
}; };
struct akcipher_test_suite { struct akcipher_test_suite {
struct akcipher_testvec *vecs; const struct akcipher_testvec *vecs;
unsigned int count; unsigned int count;
}; };
struct kpp_test_suite { struct kpp_test_suite {
struct kpp_testvec *vecs; const struct kpp_testvec *vecs;
unsigned int count; unsigned int count;
}; };
...@@ -145,7 +145,8 @@ struct alg_test_desc { ...@@ -145,7 +145,8 @@ struct alg_test_desc {
} suite; } suite;
}; };
static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; static const unsigned int IDX[8] = {
IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 };
static void hexdump(unsigned char *buf, unsigned int len) static void hexdump(unsigned char *buf, unsigned int len)
{ {
...@@ -203,7 +204,7 @@ static int wait_async_op(struct tcrypt_result *tr, int ret) ...@@ -203,7 +204,7 @@ static int wait_async_op(struct tcrypt_result *tr, int ret)
} }
static int ahash_partial_update(struct ahash_request **preq, static int ahash_partial_update(struct ahash_request **preq,
struct crypto_ahash *tfm, struct hash_testvec *template, struct crypto_ahash *tfm, const struct hash_testvec *template,
void *hash_buff, int k, int temp, struct scatterlist *sg, void *hash_buff, int k, int temp, struct scatterlist *sg,
const char *algo, char *result, struct tcrypt_result *tresult) const char *algo, char *result, struct tcrypt_result *tresult)
{ {
...@@ -260,9 +261,9 @@ static int ahash_partial_update(struct ahash_request **preq, ...@@ -260,9 +261,9 @@ static int ahash_partial_update(struct ahash_request **preq,
return ret; return ret;
} }
static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, static int __test_hash(struct crypto_ahash *tfm,
unsigned int tcount, bool use_digest, const struct hash_testvec *template, unsigned int tcount,
const int align_offset) bool use_digest, const int align_offset)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
size_t digest_size = crypto_ahash_digestsize(tfm); size_t digest_size = crypto_ahash_digestsize(tfm);
...@@ -538,7 +539,8 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ...@@ -538,7 +539,8 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
return ret; return ret;
} }
static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, static int test_hash(struct crypto_ahash *tfm,
const struct hash_testvec *template,
unsigned int tcount, bool use_digest) unsigned int tcount, bool use_digest)
{ {
unsigned int alignmask; unsigned int alignmask;
...@@ -566,7 +568,7 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, ...@@ -566,7 +568,7 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
} }
static int __test_aead(struct crypto_aead *tfm, int enc, static int __test_aead(struct crypto_aead *tfm, int enc,
struct aead_testvec *template, unsigned int tcount, const struct aead_testvec *template, unsigned int tcount,
const bool diff_dst, const int align_offset) const bool diff_dst, const int align_offset)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm));
...@@ -957,7 +959,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, ...@@ -957,7 +959,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
} }
static int test_aead(struct crypto_aead *tfm, int enc, static int test_aead(struct crypto_aead *tfm, int enc,
struct aead_testvec *template, unsigned int tcount) const struct aead_testvec *template, unsigned int tcount)
{ {
unsigned int alignmask; unsigned int alignmask;
int ret; int ret;
...@@ -990,7 +992,8 @@ static int test_aead(struct crypto_aead *tfm, int enc, ...@@ -990,7 +992,8 @@ static int test_aead(struct crypto_aead *tfm, int enc,
} }
static int test_cipher(struct crypto_cipher *tfm, int enc, static int test_cipher(struct crypto_cipher *tfm, int enc,
struct cipher_testvec *template, unsigned int tcount) const struct cipher_testvec *template,
unsigned int tcount)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm));
unsigned int i, j, k; unsigned int i, j, k;
...@@ -1068,7 +1071,8 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, ...@@ -1068,7 +1071,8 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
} }
static int __test_skcipher(struct crypto_skcipher *tfm, int enc, static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
struct cipher_testvec *template, unsigned int tcount, const struct cipher_testvec *template,
unsigned int tcount,
const bool diff_dst, const int align_offset) const bool diff_dst, const int align_offset)
{ {
const char *algo = const char *algo =
...@@ -1332,7 +1336,8 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc, ...@@ -1332,7 +1336,8 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
} }
static int test_skcipher(struct crypto_skcipher *tfm, int enc, static int test_skcipher(struct crypto_skcipher *tfm, int enc,
struct cipher_testvec *template, unsigned int tcount) const struct cipher_testvec *template,
unsigned int tcount)
{ {
unsigned int alignmask; unsigned int alignmask;
int ret; int ret;
...@@ -1364,8 +1369,10 @@ static int test_skcipher(struct crypto_skcipher *tfm, int enc, ...@@ -1364,8 +1369,10 @@ static int test_skcipher(struct crypto_skcipher *tfm, int enc,
return 0; return 0;
} }
static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, static int test_comp(struct crypto_comp *tfm,
struct comp_testvec *dtemplate, int ctcount, int dtcount) const struct comp_testvec *ctemplate,
const struct comp_testvec *dtemplate,
int ctcount, int dtcount)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_comp_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_comp_tfm(tfm));
unsigned int i; unsigned int i;
...@@ -1444,12 +1451,14 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate, ...@@ -1444,12 +1451,14 @@ static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate,
return ret; return ret;
} }
static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, static int test_acomp(struct crypto_acomp *tfm,
struct comp_testvec *dtemplate, int ctcount, int dtcount) const struct comp_testvec *ctemplate,
const struct comp_testvec *dtemplate,
int ctcount, int dtcount)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm));
unsigned int i; unsigned int i;
char *output; char *output, *decomp_out;
int ret; int ret;
struct scatterlist src, dst; struct scatterlist src, dst;
struct acomp_req *req; struct acomp_req *req;
...@@ -1459,6 +1468,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, ...@@ -1459,6 +1468,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
if (!output) if (!output)
return -ENOMEM; return -ENOMEM;
decomp_out = kmalloc(COMP_BUF_SIZE, GFP_KERNEL);
if (!decomp_out) {
kfree(output);
return -ENOMEM;
}
for (i = 0; i < ctcount; i++) { for (i = 0; i < ctcount; i++) {
unsigned int dlen = COMP_BUF_SIZE; unsigned int dlen = COMP_BUF_SIZE;
int ilen = ctemplate[i].inlen; int ilen = ctemplate[i].inlen;
...@@ -1497,7 +1512,23 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, ...@@ -1497,7 +1512,23 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
goto out; goto out;
} }
if (req->dlen != ctemplate[i].outlen) { ilen = req->dlen;
dlen = COMP_BUF_SIZE;
sg_init_one(&src, output, ilen);
sg_init_one(&dst, decomp_out, dlen);
init_completion(&result.completion);
acomp_request_set_params(req, &src, &dst, ilen, dlen);
ret = wait_async_op(&result, crypto_acomp_decompress(req));
if (ret) {
pr_err("alg: acomp: decompression failed on test %d for %s: ret=%d\n",
i + 1, algo, -ret);
kfree(input_vec);
acomp_request_free(req);
goto out;
}
if (req->dlen != ctemplate[i].inlen) {
pr_err("alg: acomp: Compression test %d failed for %s: output len = %d\n", pr_err("alg: acomp: Compression test %d failed for %s: output len = %d\n",
i + 1, algo, req->dlen); i + 1, algo, req->dlen);
ret = -EINVAL; ret = -EINVAL;
...@@ -1506,7 +1537,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, ...@@ -1506,7 +1537,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
goto out; goto out;
} }
if (memcmp(output, ctemplate[i].output, req->dlen)) { if (memcmp(input_vec, decomp_out, req->dlen)) {
pr_err("alg: acomp: Compression test %d failed for %s\n", pr_err("alg: acomp: Compression test %d failed for %s\n",
i + 1, algo); i + 1, algo);
hexdump(output, req->dlen); hexdump(output, req->dlen);
...@@ -1584,11 +1615,13 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, ...@@ -1584,11 +1615,13 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
ret = 0; ret = 0;
out: out:
kfree(decomp_out);
kfree(output); kfree(output);
return ret; return ret;
} }
static int test_cprng(struct crypto_rng *tfm, struct cprng_testvec *template, static int test_cprng(struct crypto_rng *tfm,
const struct cprng_testvec *template,
unsigned int tcount) unsigned int tcount)
{ {
const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm)); const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm));
...@@ -1865,7 +1898,7 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver, ...@@ -1865,7 +1898,7 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
} }
static int drbg_cavs_test(struct drbg_testvec *test, int pr, static int drbg_cavs_test(const struct drbg_testvec *test, int pr,
const char *driver, u32 type, u32 mask) const char *driver, u32 type, u32 mask)
{ {
int ret = -EAGAIN; int ret = -EAGAIN;
...@@ -1939,7 +1972,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, ...@@ -1939,7 +1972,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver,
int err = 0; int err = 0;
int pr = 0; int pr = 0;
int i = 0; int i = 0;
struct drbg_testvec *template = desc->suite.drbg.vecs; const struct drbg_testvec *template = desc->suite.drbg.vecs;
unsigned int tcount = desc->suite.drbg.count; unsigned int tcount = desc->suite.drbg.count;
if (0 == memcmp(driver, "drbg_pr_", 8)) if (0 == memcmp(driver, "drbg_pr_", 8))
...@@ -1958,7 +1991,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, ...@@ -1958,7 +1991,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver,
} }
static int do_test_kpp(struct crypto_kpp *tfm, struct kpp_testvec *vec, static int do_test_kpp(struct crypto_kpp *tfm, const struct kpp_testvec *vec,
const char *alg) const char *alg)
{ {
struct kpp_request *req; struct kpp_request *req;
...@@ -2050,7 +2083,7 @@ static int do_test_kpp(struct crypto_kpp *tfm, struct kpp_testvec *vec, ...@@ -2050,7 +2083,7 @@ static int do_test_kpp(struct crypto_kpp *tfm, struct kpp_testvec *vec,
} }
static int test_kpp(struct crypto_kpp *tfm, const char *alg, static int test_kpp(struct crypto_kpp *tfm, const char *alg,
struct kpp_testvec *vecs, unsigned int tcount) const struct kpp_testvec *vecs, unsigned int tcount)
{ {
int ret, i; int ret, i;
...@@ -2086,7 +2119,7 @@ static int alg_test_kpp(const struct alg_test_desc *desc, const char *driver, ...@@ -2086,7 +2119,7 @@ static int alg_test_kpp(const struct alg_test_desc *desc, const char *driver,
} }
static int test_akcipher_one(struct crypto_akcipher *tfm, static int test_akcipher_one(struct crypto_akcipher *tfm,
struct akcipher_testvec *vecs) const struct akcipher_testvec *vecs)
{ {
char *xbuf[XBUFSIZE]; char *xbuf[XBUFSIZE];
struct akcipher_request *req; struct akcipher_request *req;
...@@ -2206,7 +2239,8 @@ static int test_akcipher_one(struct crypto_akcipher *tfm, ...@@ -2206,7 +2239,8 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
} }
static int test_akcipher(struct crypto_akcipher *tfm, const char *alg, static int test_akcipher(struct crypto_akcipher *tfm, const char *alg,
struct akcipher_testvec *vecs, unsigned int tcount) const struct akcipher_testvec *vecs,
unsigned int tcount)
{ {
const char *algo = const char *algo =
crypto_tfm_alg_driver_name(crypto_akcipher_tfm(tfm)); crypto_tfm_alg_driver_name(crypto_akcipher_tfm(tfm));
...@@ -2634,6 +2668,7 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -2634,6 +2668,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "ctr(des3_ede)", .alg = "ctr(des3_ede)",
.test = alg_test_skcipher, .test = alg_test_skcipher,
.fips_allowed = 1,
.suite = { .suite = {
.cipher = { .cipher = {
.enc = __VECS(des3_ede_ctr_enc_tv_template), .enc = __VECS(des3_ede_ctr_enc_tv_template),
...@@ -2875,6 +2910,7 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -2875,6 +2910,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}, { }, {
.alg = "ecb(cipher_null)", .alg = "ecb(cipher_null)",
.test = alg_test_null, .test = alg_test_null,
.fips_allowed = 1,
}, { }, {
.alg = "ecb(des)", .alg = "ecb(des)",
.test = alg_test_skcipher, .test = alg_test_skcipher,
...@@ -3477,6 +3513,16 @@ static const struct alg_test_desc alg_test_descs[] = { ...@@ -3477,6 +3513,16 @@ static const struct alg_test_desc alg_test_descs[] = {
.dec = __VECS(tf_xts_dec_tv_template) .dec = __VECS(tf_xts_dec_tv_template)
} }
} }
}, {
.alg = "zlib-deflate",
.test = alg_test_comp,
.fips_allowed = 1,
.suite = {
.comp = {
.comp = __VECS(zlib_deflate_comp_tv_template),
.decomp = __VECS(zlib_deflate_decomp_tv_template)
}
}
} }
}; };
......
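The testmgr change above replaces the compression known-answer test with a round trip: since compressed output is not unique across implementations, the test now compresses, decompresses the result, and requires the output to match the original input. The same idea in miniature, using plain zlib as a stand-in for the kernel acomp API:

#include <zlib.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char in[] = "round-trip test payload, round-trip test payload";
	unsigned char comp[256], decomp[256];
	uLongf clen = sizeof(comp), dlen = sizeof(decomp);

	if (compress(comp, &clen, (const Bytef *)in, sizeof(in)) != Z_OK)
		return 1;
	if (uncompress(decomp, &dlen, comp, clen) != Z_OK)
		return 1;

	/* compare against the input, not a fixed compressed blob */
	if (dlen != sizeof(in) || memcmp(in, decomp, dlen))
		return 1;

	puts("round trip ok");
	return 0;
}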
(This diff has been collapsed.)
...@@ -39,11 +39,11 @@ struct xts_instance_ctx { ...@@ -39,11 +39,11 @@ struct xts_instance_ctx {
}; };
struct rctx { struct rctx {
be128 buf[XTS_BUFFER_SIZE / sizeof(be128)]; le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
be128 t; le128 t;
be128 *ext; le128 *ext;
struct scatterlist srcbuf[2]; struct scatterlist srcbuf[2];
struct scatterlist dstbuf[2]; struct scatterlist dstbuf[2];
...@@ -99,7 +99,7 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, ...@@ -99,7 +99,7 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
static int post_crypt(struct skcipher_request *req) static int post_crypt(struct skcipher_request *req)
{ {
struct rctx *rctx = skcipher_request_ctx(req); struct rctx *rctx = skcipher_request_ctx(req);
be128 *buf = rctx->ext ?: rctx->buf; le128 *buf = rctx->ext ?: rctx->buf;
struct skcipher_request *subreq; struct skcipher_request *subreq;
const int bs = XTS_BLOCK_SIZE; const int bs = XTS_BLOCK_SIZE;
struct skcipher_walk w; struct skcipher_walk w;
...@@ -112,12 +112,12 @@ static int post_crypt(struct skcipher_request *req) ...@@ -112,12 +112,12 @@ static int post_crypt(struct skcipher_request *req)
while (w.nbytes) { while (w.nbytes) {
unsigned int avail = w.nbytes; unsigned int avail = w.nbytes;
be128 *wdst; le128 *wdst;
wdst = w.dst.virt.addr; wdst = w.dst.virt.addr;
do { do {
be128_xor(wdst, buf++, wdst); le128_xor(wdst, buf++, wdst);
wdst++; wdst++;
} while ((avail -= bs) >= bs); } while ((avail -= bs) >= bs);
...@@ -150,7 +150,7 @@ static int post_crypt(struct skcipher_request *req) ...@@ -150,7 +150,7 @@ static int post_crypt(struct skcipher_request *req)
static int pre_crypt(struct skcipher_request *req) static int pre_crypt(struct skcipher_request *req)
{ {
struct rctx *rctx = skcipher_request_ctx(req); struct rctx *rctx = skcipher_request_ctx(req);
be128 *buf = rctx->ext ?: rctx->buf; le128 *buf = rctx->ext ?: rctx->buf;
struct skcipher_request *subreq; struct skcipher_request *subreq;
const int bs = XTS_BLOCK_SIZE; const int bs = XTS_BLOCK_SIZE;
struct skcipher_walk w; struct skcipher_walk w;
...@@ -174,15 +174,15 @@ static int pre_crypt(struct skcipher_request *req) ...@@ -174,15 +174,15 @@ static int pre_crypt(struct skcipher_request *req)
while (w.nbytes) { while (w.nbytes) {
unsigned int avail = w.nbytes; unsigned int avail = w.nbytes;
be128 *wsrc; le128 *wsrc;
be128 *wdst; le128 *wdst;
wsrc = w.src.virt.addr; wsrc = w.src.virt.addr;
wdst = w.dst.virt.addr; wdst = w.dst.virt.addr;
do { do {
*buf++ = rctx->t; *buf++ = rctx->t;
be128_xor(wdst++, &rctx->t, wsrc++); le128_xor(wdst++, &rctx->t, wsrc++);
gf128mul_x_ble(&rctx->t, &rctx->t); gf128mul_x_ble(&rctx->t, &rctx->t);
} while ((avail -= bs) >= bs); } while ((avail -= bs) >= bs);
...@@ -369,8 +369,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -369,8 +369,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
const unsigned int max_blks = req->tbuflen / bsize; const unsigned int max_blks = req->tbuflen / bsize;
struct blkcipher_walk walk; struct blkcipher_walk walk;
unsigned int nblocks; unsigned int nblocks;
be128 *src, *dst, *t; le128 *src, *dst, *t;
be128 *t_buf = req->tbuf; le128 *t_buf = req->tbuf;
int err, i; int err, i;
BUG_ON(max_blks < 1); BUG_ON(max_blks < 1);
...@@ -383,8 +383,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -383,8 +383,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
return err; return err;
nblocks = min(nbytes / bsize, max_blks); nblocks = min(nbytes / bsize, max_blks);
src = (be128 *)walk.src.virt.addr; src = (le128 *)walk.src.virt.addr;
dst = (be128 *)walk.dst.virt.addr; dst = (le128 *)walk.dst.virt.addr;
/* calculate first value of T */ /* calculate first value of T */
req->tweak_fn(req->tweak_ctx, (u8 *)&t_buf[0], walk.iv); req->tweak_fn(req->tweak_ctx, (u8 *)&t_buf[0], walk.iv);
...@@ -400,7 +400,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -400,7 +400,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
t = &t_buf[i]; t = &t_buf[i];
/* PP <- T xor P */ /* PP <- T xor P */
be128_xor(dst + i, t, src + i); le128_xor(dst + i, t, src + i);
} }
/* CC <- E(Key2,PP) */ /* CC <- E(Key2,PP) */
...@@ -409,7 +409,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -409,7 +409,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
/* C <- T xor CC */ /* C <- T xor CC */
for (i = 0; i < nblocks; i++) for (i = 0; i < nblocks; i++)
be128_xor(dst + i, dst + i, &t_buf[i]); le128_xor(dst + i, dst + i, &t_buf[i]);
src += nblocks; src += nblocks;
dst += nblocks; dst += nblocks;
...@@ -417,7 +417,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -417,7 +417,7 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
nblocks = min(nbytes / bsize, max_blks); nblocks = min(nbytes / bsize, max_blks);
} while (nblocks > 0); } while (nblocks > 0);
*(be128 *)walk.iv = *t; *(le128 *)walk.iv = *t;
err = blkcipher_walk_done(desc, &walk, nbytes); err = blkcipher_walk_done(desc, &walk, nbytes);
nbytes = walk.nbytes; nbytes = walk.nbytes;
...@@ -425,8 +425,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, ...@@ -425,8 +425,8 @@ int xts_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst,
break; break;
nblocks = min(nbytes / bsize, max_blks); nblocks = min(nbytes / bsize, max_blks);
src = (be128 *)walk.src.virt.addr; src = (le128 *)walk.src.virt.addr;
dst = (be128 *)walk.dst.virt.addr; dst = (le128 *)walk.dst.virt.addr;
} }
return err; return err;
......
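The xts.c hunks above switch the tweak arithmetic from be128 to le128, matching the little-endian convention of gf128mul_x_ble(). For reference, the per-block tweak doubling that gf128mul_x_ble() performs can be sketched on a plain 16-byte buffer like this (an illustration, not the kernel helper itself):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), little-endian
 * ("ble") convention: byte 0 is the least significant byte. */
static void xts_tweak_double_le(uint8_t t[16])
{
	uint8_t carry = 0;
	int i;

	for (i = 0; i < 16; i++) {
		uint8_t msb = t[i] >> 7;

		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;	/* reduce by x^128 + x^7 + x^2 + x + 1 */
}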
...@@ -294,20 +294,6 @@ config HW_RANDOM_POWERNV ...@@ -294,20 +294,6 @@ config HW_RANDOM_POWERNV
If unsure, say Y. If unsure, say Y.
config HW_RANDOM_EXYNOS
tristate "EXYNOS HW random number generator support"
depends on ARCH_EXYNOS || COMPILE_TEST
depends on HAS_IOMEM
default HW_RANDOM
---help---
This driver provides kernel-side support for the Random Number
Generator hardware found on EXYNOS SOCs.
To compile this driver as a module, choose M here: the
module will be called exynos-rng.
If unsure, say Y.
config HW_RANDOM_TPM config HW_RANDOM_TPM
tristate "TPM HW Random Number Generator support" tristate "TPM HW Random Number Generator support"
depends on TCG_TPM depends on TCG_TPM
...@@ -423,6 +409,20 @@ config HW_RANDOM_CAVIUM ...@@ -423,6 +409,20 @@ config HW_RANDOM_CAVIUM
If unsure, say Y. If unsure, say Y.
config HW_RANDOM_MTK
tristate "Mediatek Random Number Generator support"
depends on HW_RANDOM
depends on ARCH_MEDIATEK || COMPILE_TEST
default y
---help---
This driver provides kernel-side support for the Random Number
Generator hardware found on Mediatek SoCs.
To compile this driver as a module, choose M here: the
module will be called mtk-rng.
If unsure, say Y.
config HW_RANDOM_S390 config HW_RANDOM_S390
tristate "S390 True Random Number Generator support" tristate "S390 True Random Number Generator support"
depends on S390 depends on S390
......
...@@ -24,7 +24,6 @@ obj-$(CONFIG_HW_RANDOM_OCTEON) += octeon-rng.o ...@@ -24,7 +24,6 @@ obj-$(CONFIG_HW_RANDOM_OCTEON) += octeon-rng.o
obj-$(CONFIG_HW_RANDOM_NOMADIK) += nomadik-rng.o obj-$(CONFIG_HW_RANDOM_NOMADIK) += nomadik-rng.o
obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o
obj-$(CONFIG_HW_RANDOM_POWERNV) += powernv-rng.o obj-$(CONFIG_HW_RANDOM_POWERNV) += powernv-rng.o
obj-$(CONFIG_HW_RANDOM_EXYNOS) += exynos-rng.o
obj-$(CONFIG_HW_RANDOM_HISI) += hisi-rng.o obj-$(CONFIG_HW_RANDOM_HISI) += hisi-rng.o
obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o
obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o
...@@ -36,4 +35,5 @@ obj-$(CONFIG_HW_RANDOM_STM32) += stm32-rng.o ...@@ -36,4 +35,5 @@ obj-$(CONFIG_HW_RANDOM_STM32) += stm32-rng.o
obj-$(CONFIG_HW_RANDOM_PIC32) += pic32-rng.o obj-$(CONFIG_HW_RANDOM_PIC32) += pic32-rng.o
obj-$(CONFIG_HW_RANDOM_MESON) += meson-rng.o obj-$(CONFIG_HW_RANDOM_MESON) += meson-rng.o
obj-$(CONFIG_HW_RANDOM_CAVIUM) += cavium-rng.o cavium-rng-vf.o obj-$(CONFIG_HW_RANDOM_CAVIUM) += cavium-rng.o cavium-rng-vf.o
obj-$(CONFIG_HW_RANDOM_MTK) += mtk-rng.o
obj-$(CONFIG_HW_RANDOM_S390) += s390-trng.o obj-$(CONFIG_HW_RANDOM_S390) += s390-trng.o
/*
* exynos-rng.c - Random Number Generator driver for the exynos
*
* Copyright (C) 2012 Samsung Electronics
* Jonghwa Lee <jonghwa3.lee@samsung.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation;
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/hw_random.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/platform_device.h>
#include <linux/clk.h>
#include <linux/pm_runtime.h>
#include <linux/err.h>
#define EXYNOS_PRNG_STATUS_OFFSET 0x10
#define EXYNOS_PRNG_SEED_OFFSET 0x140
#define EXYNOS_PRNG_OUT1_OFFSET 0x160
#define SEED_SETTING_DONE BIT(1)
#define PRNG_START 0x18
#define PRNG_DONE BIT(5)
#define EXYNOS_AUTOSUSPEND_DELAY 100
struct exynos_rng {
struct device *dev;
struct hwrng rng;
void __iomem *mem;
struct clk *clk;
};
static u32 exynos_rng_readl(struct exynos_rng *rng, u32 offset)
{
return readl_relaxed(rng->mem + offset);
}
static void exynos_rng_writel(struct exynos_rng *rng, u32 val, u32 offset)
{
writel_relaxed(val, rng->mem + offset);
}
static int exynos_rng_configure(struct exynos_rng *exynos_rng)
{
int i;
int ret = 0;
for (i = 0 ; i < 5 ; i++)
exynos_rng_writel(exynos_rng, jiffies,
EXYNOS_PRNG_SEED_OFFSET + 4*i);
if (!(exynos_rng_readl(exynos_rng, EXYNOS_PRNG_STATUS_OFFSET)
& SEED_SETTING_DONE))
ret = -EIO;
return ret;
}
static int exynos_init(struct hwrng *rng)
{
struct exynos_rng *exynos_rng = container_of(rng,
struct exynos_rng, rng);
int ret = 0;
pm_runtime_get_sync(exynos_rng->dev);
ret = exynos_rng_configure(exynos_rng);
pm_runtime_mark_last_busy(exynos_rng->dev);
pm_runtime_put_autosuspend(exynos_rng->dev);
return ret;
}
static int exynos_read(struct hwrng *rng, void *buf,
size_t max, bool wait)
{
struct exynos_rng *exynos_rng = container_of(rng,
struct exynos_rng, rng);
u32 *data = buf;
int retry = 100;
int ret = 4;
pm_runtime_get_sync(exynos_rng->dev);
exynos_rng_writel(exynos_rng, PRNG_START, 0);
while (!(exynos_rng_readl(exynos_rng,
EXYNOS_PRNG_STATUS_OFFSET) & PRNG_DONE) && --retry)
cpu_relax();
if (!retry) {
ret = -ETIMEDOUT;
goto out;
}
exynos_rng_writel(exynos_rng, PRNG_DONE, EXYNOS_PRNG_STATUS_OFFSET);
*data = exynos_rng_readl(exynos_rng, EXYNOS_PRNG_OUT1_OFFSET);
out:
pm_runtime_mark_last_busy(exynos_rng->dev);
pm_runtime_put_sync_autosuspend(exynos_rng->dev);
return ret;
}
static int exynos_rng_probe(struct platform_device *pdev)
{
struct exynos_rng *exynos_rng;
struct resource *res;
int ret;
exynos_rng = devm_kzalloc(&pdev->dev, sizeof(struct exynos_rng),
GFP_KERNEL);
if (!exynos_rng)
return -ENOMEM;
exynos_rng->dev = &pdev->dev;
exynos_rng->rng.name = "exynos";
exynos_rng->rng.init = exynos_init;
exynos_rng->rng.read = exynos_read;
exynos_rng->clk = devm_clk_get(&pdev->dev, "secss");
if (IS_ERR(exynos_rng->clk)) {
dev_err(&pdev->dev, "Couldn't get clock.\n");
return -ENOENT;
}
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
exynos_rng->mem = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(exynos_rng->mem))
return PTR_ERR(exynos_rng->mem);
platform_set_drvdata(pdev, exynos_rng);
pm_runtime_set_autosuspend_delay(&pdev->dev, EXYNOS_AUTOSUSPEND_DELAY);
pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_enable(&pdev->dev);
ret = devm_hwrng_register(&pdev->dev, &exynos_rng->rng);
if (ret) {
pm_runtime_dont_use_autosuspend(&pdev->dev);
pm_runtime_disable(&pdev->dev);
}
return ret;
}
static int exynos_rng_remove(struct platform_device *pdev)
{
pm_runtime_dont_use_autosuspend(&pdev->dev);
pm_runtime_disable(&pdev->dev);
return 0;
}
static int __maybe_unused exynos_rng_runtime_suspend(struct device *dev)
{
struct platform_device *pdev = to_platform_device(dev);
struct exynos_rng *exynos_rng = platform_get_drvdata(pdev);
clk_disable_unprepare(exynos_rng->clk);
return 0;
}
static int __maybe_unused exynos_rng_runtime_resume(struct device *dev)
{
struct platform_device *pdev = to_platform_device(dev);
struct exynos_rng *exynos_rng = platform_get_drvdata(pdev);
return clk_prepare_enable(exynos_rng->clk);
}
static int __maybe_unused exynos_rng_suspend(struct device *dev)
{
return pm_runtime_force_suspend(dev);
}
static int __maybe_unused exynos_rng_resume(struct device *dev)
{
struct platform_device *pdev = to_platform_device(dev);
struct exynos_rng *exynos_rng = platform_get_drvdata(pdev);
int ret;
ret = pm_runtime_force_resume(dev);
if (ret)
return ret;
return exynos_rng_configure(exynos_rng);
}
static const struct dev_pm_ops exynos_rng_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(exynos_rng_suspend, exynos_rng_resume)
SET_RUNTIME_PM_OPS(exynos_rng_runtime_suspend,
exynos_rng_runtime_resume, NULL)
};
static const struct of_device_id exynos_rng_dt_match[] = {
{
.compatible = "samsung,exynos4-rng",
},
{ },
};
MODULE_DEVICE_TABLE(of, exynos_rng_dt_match);
static struct platform_driver exynos_rng_driver = {
.driver = {
.name = "exynos-rng",
.pm = &exynos_rng_pm_ops,
.of_match_table = exynos_rng_dt_match,
},
.probe = exynos_rng_probe,
.remove = exynos_rng_remove,
};
module_platform_driver(exynos_rng_driver);
MODULE_DESCRIPTION("EXYNOS 4 H/W Random Number Generator driver");
MODULE_AUTHOR("Jonghwa Lee <jonghwa3.lee@samsung.com>");
MODULE_LICENSE("GPL");
...@@ -62,6 +62,7 @@ ...@@ -62,6 +62,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/of.h> #include <linux/of.h>
#include <linux/clk.h>
#define RNG_DATA 0x00 #define RNG_DATA 0x00
...@@ -69,6 +70,7 @@ struct meson_rng_data { ...@@ -69,6 +70,7 @@ struct meson_rng_data {
void __iomem *base; void __iomem *base;
struct platform_device *pdev; struct platform_device *pdev;
struct hwrng rng; struct hwrng rng;
struct clk *core_clk;
}; };
static int meson_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait) static int meson_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
...@@ -81,11 +83,17 @@ static int meson_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait) ...@@ -81,11 +83,17 @@ static int meson_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
return sizeof(u32); return sizeof(u32);
} }
static void meson_rng_clk_disable(void *data)
{
clk_disable_unprepare(data);
}
static int meson_rng_probe(struct platform_device *pdev) static int meson_rng_probe(struct platform_device *pdev)
{ {
struct device *dev = &pdev->dev; struct device *dev = &pdev->dev;
struct meson_rng_data *data; struct meson_rng_data *data;
struct resource *res; struct resource *res;
int ret;
data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
if (!data) if (!data)
...@@ -98,6 +106,20 @@ static int meson_rng_probe(struct platform_device *pdev) ...@@ -98,6 +106,20 @@ static int meson_rng_probe(struct platform_device *pdev)
if (IS_ERR(data->base)) if (IS_ERR(data->base))
return PTR_ERR(data->base); return PTR_ERR(data->base);
data->core_clk = devm_clk_get(dev, "core");
if (IS_ERR(data->core_clk))
data->core_clk = NULL;
if (data->core_clk) {
ret = clk_prepare_enable(data->core_clk);
if (ret)
return ret;
ret = devm_add_action_or_reset(dev, meson_rng_clk_disable,
data->core_clk);
if (ret)
return ret;
}
data->rng.name = pdev->name; data->rng.name = pdev->name;
data->rng.read = meson_rng_read; data->rng.read = meson_rng_read;
......
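The meson-rng change above takes an optional "core" clock and hands its teardown to devm_add_action_or_reset(), so the driver needs no explicit .remove handling for the clock. A minimal sketch of that pattern in a hypothetical probe (all names here are placeholders):

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/platform_device.h>

static void demo_clk_disable(void *data)
{
	clk_disable_unprepare(data);
}

static int demo_probe(struct platform_device *pdev)
{
	struct clk *clk;
	int ret;

	clk = devm_clk_get(&pdev->dev, "core");
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	ret = clk_prepare_enable(clk);
	if (ret)
		return ret;

	/* On failure the action runs immediately; on success it runs
	 * automatically when the device is unbound. */
	return devm_add_action_or_reset(&pdev->dev, demo_clk_disable, clk);
}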
/*
* Driver for Mediatek Hardware Random Number Generator
*
* Copyright (C) 2017 Sean Wang <sean.wang@mediatek.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define MTK_RNG_DEV KBUILD_MODNAME
#include <linux/clk.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/hw_random.h>
#include <linux/io.h>
#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#define USEC_POLL 2
#define TIMEOUT_POLL 20
#define RNG_CTRL 0x00
#define RNG_EN BIT(0)
#define RNG_READY BIT(31)
#define RNG_DATA 0x08
#define to_mtk_rng(p) container_of(p, struct mtk_rng, rng)
struct mtk_rng {
void __iomem *base;
struct clk *clk;
struct hwrng rng;
};
static int mtk_rng_init(struct hwrng *rng)
{
struct mtk_rng *priv = to_mtk_rng(rng);
u32 val;
int err;
err = clk_prepare_enable(priv->clk);
if (err)
return err;
val = readl(priv->base + RNG_CTRL);
val |= RNG_EN;
writel(val, priv->base + RNG_CTRL);
return 0;
}
static void mtk_rng_cleanup(struct hwrng *rng)
{
struct mtk_rng *priv = to_mtk_rng(rng);
u32 val;
val = readl(priv->base + RNG_CTRL);
val &= ~RNG_EN;
writel(val, priv->base + RNG_CTRL);
clk_disable_unprepare(priv->clk);
}
static bool mtk_rng_wait_ready(struct hwrng *rng, bool wait)
{
struct mtk_rng *priv = to_mtk_rng(rng);
int ready;
ready = readl(priv->base + RNG_CTRL) & RNG_READY;
if (!ready && wait)
readl_poll_timeout_atomic(priv->base + RNG_CTRL, ready,
ready & RNG_READY, USEC_POLL,
TIMEOUT_POLL);
return !!ready;
}
static int mtk_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
{
struct mtk_rng *priv = to_mtk_rng(rng);
int retval = 0;
while (max >= sizeof(u32)) {
if (!mtk_rng_wait_ready(rng, wait))
break;
*(u32 *)buf = readl(priv->base + RNG_DATA);
retval += sizeof(u32);
buf += sizeof(u32);
max -= sizeof(u32);
}
return retval || !wait ? retval : -EIO;
}
static int mtk_rng_probe(struct platform_device *pdev)
{
struct resource *res;
int ret;
struct mtk_rng *priv;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res) {
dev_err(&pdev->dev, "no iomem resource\n");
return -ENXIO;
}
priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
priv->rng.name = pdev->name;
priv->rng.init = mtk_rng_init;
priv->rng.cleanup = mtk_rng_cleanup;
priv->rng.read = mtk_rng_read;
priv->clk = devm_clk_get(&pdev->dev, "rng");
if (IS_ERR(priv->clk)) {
ret = PTR_ERR(priv->clk);
dev_err(&pdev->dev, "no clock for device: %d\n", ret);
return ret;
}
priv->base = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(priv->base))
return PTR_ERR(priv->base);
ret = devm_hwrng_register(&pdev->dev, &priv->rng);
if (ret) {
dev_err(&pdev->dev, "failed to register rng device: %d\n",
ret);
return ret;
}
dev_info(&pdev->dev, "registered RNG driver\n");
return 0;
}
static const struct of_device_id mtk_rng_match[] = {
{ .compatible = "mediatek,mt7623-rng" },
{},
};
MODULE_DEVICE_TABLE(of, mtk_rng_match);
static struct platform_driver mtk_rng_driver = {
.probe = mtk_rng_probe,
.driver = {
.name = MTK_RNG_DEV,
.of_match_table = mtk_rng_match,
},
};
module_platform_driver(mtk_rng_driver);
MODULE_DESCRIPTION("Mediatek Random Number Generator Driver");
MODULE_AUTHOR("Sean Wang <sean.wang@mediatek.com>");
MODULE_LICENSE("GPL");
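Once mtk-rng is bound and registered, the hwrng core exposes its output through the hwrng character device. A quick userspace check (assuming the driver is the current RNG and /dev/hwrng is present):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t word;
	int fd = open("/dev/hwrng", O_RDONLY);

	if (fd < 0 || read(fd, &word, sizeof(word)) != sizeof(word)) {
		perror("/dev/hwrng");
		return 1;
	}
	printf("0x%08x\n", word);
	close(fd);
	return 0;
}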
...@@ -748,9 +748,7 @@ static int n2rng_probe(struct platform_device *op) ...@@ -748,9 +748,7 @@ static int n2rng_probe(struct platform_device *op)
dev_info(&op->dev, "Registered RNG HVAPI major %lu minor %lu\n", dev_info(&op->dev, "Registered RNG HVAPI major %lu minor %lu\n",
np->hvapi_major, np->hvapi_minor); np->hvapi_major, np->hvapi_minor);
np->units = devm_kcalloc(&op->dev, np->num_units, sizeof(*np->units),
np->units = devm_kzalloc(&op->dev,
sizeof(struct n2rng_unit) * np->num_units,
GFP_KERNEL); GFP_KERNEL);
err = -ENOMEM; err = -ENOMEM;
if (!np->units) if (!np->units)
......
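The n2 hunk replaces an open-coded sizeof(...) * count allocation with devm_kcalloc(), whose point is that the element-count multiplication is checked for overflow before any memory is handed out. The guard, sketched as a userspace analogue:

#include <stdint.h>
#include <stdlib.h>

/* calloc-style overflow check: refuse requests where n * size
 * would wrap around SIZE_MAX. */
static void *checked_calloc(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;
	return calloc(n, size);
}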
...@@ -398,16 +398,6 @@ static int of_get_omap_rng_device_details(struct omap_rng_dev *priv, ...@@ -398,16 +398,6 @@ static int of_get_omap_rng_device_details(struct omap_rng_dev *priv,
return err; return err;
} }
priv->clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(priv->clk) && PTR_ERR(priv->clk) == -EPROBE_DEFER)
return -EPROBE_DEFER;
if (!IS_ERR(priv->clk)) {
err = clk_prepare_enable(priv->clk);
if (err)
dev_err(&pdev->dev, "unable to enable the clk, "
"err = %d\n", err);
}
/* /*
* On OMAP4, enabling the shutdown_oflo interrupt is * On OMAP4, enabling the shutdown_oflo interrupt is
* done in the interrupt mask register. There is no * done in the interrupt mask register. There is no
...@@ -478,6 +468,18 @@ static int omap_rng_probe(struct platform_device *pdev) ...@@ -478,6 +468,18 @@ static int omap_rng_probe(struct platform_device *pdev)
goto err_ioremap; goto err_ioremap;
} }
priv->clk = devm_clk_get(&pdev->dev, NULL);
if (IS_ERR(priv->clk) && PTR_ERR(priv->clk) == -EPROBE_DEFER)
return -EPROBE_DEFER;
if (!IS_ERR(priv->clk)) {
ret = clk_prepare_enable(priv->clk);
if (ret) {
dev_err(&pdev->dev,
"Unable to enable the clk: %d\n", ret);
goto err_register;
}
}
ret = (dev->of_node) ? of_get_omap_rng_device_details(priv, pdev) : ret = (dev->of_node) ? of_get_omap_rng_device_details(priv, pdev) :
get_omap_rng_device_details(priv); get_omap_rng_device_details(priv);
if (ret) if (ret)
......
...@@ -193,7 +193,7 @@ ...@@ -193,7 +193,7 @@
/* CLKID_I2C */ /* CLKID_I2C */
/* #define CLKID_SAR_ADC */ /* #define CLKID_SAR_ADC */
#define CLKID_SMART_CARD 24 #define CLKID_SMART_CARD 24
#define CLKID_RNG0 25 /* CLKID_RNG0 */
#define CLKID_UART0 26 #define CLKID_UART0 26
#define CLKID_SDHC 27 #define CLKID_SDHC 27
#define CLKID_STREAM 28 #define CLKID_STREAM 28
......
This diff is collapsed.
...@@ -2,9 +2,11 @@ obj-$(CONFIG_CRYPTO_DEV_ATMEL_AES) += atmel-aes.o ...@@ -2,9 +2,11 @@ obj-$(CONFIG_CRYPTO_DEV_ATMEL_AES) += atmel-aes.o
obj-$(CONFIG_CRYPTO_DEV_ATMEL_SHA) += atmel-sha.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_SHA) += atmel-sha.o
obj-$(CONFIG_CRYPTO_DEV_ATMEL_TDES) += atmel-tdes.o obj-$(CONFIG_CRYPTO_DEV_ATMEL_TDES) += atmel-tdes.o
obj-$(CONFIG_CRYPTO_DEV_BFIN_CRC) += bfin_crc.o obj-$(CONFIG_CRYPTO_DEV_BFIN_CRC) += bfin_crc.o
obj-$(CONFIG_CRYPTO_DEV_CAVIUM_ZIP) += cavium/
obj-$(CONFIG_CRYPTO_DEV_CCP) += ccp/ obj-$(CONFIG_CRYPTO_DEV_CCP) += ccp/
obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/ obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/
obj-$(CONFIG_CRYPTO_DEV_CPT) += cavium/cpt/ obj-$(CONFIG_CRYPTO_DEV_CPT) += cavium/cpt/
obj-$(CONFIG_CRYPTO_DEV_EXYNOS_RNG) += exynos-rng.o
obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM) += caam/ obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM) += caam/
obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o
obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o
...@@ -30,6 +32,7 @@ obj-$(CONFIG_CRYPTO_DEV_QCE) += qce/ ...@@ -30,6 +32,7 @@ obj-$(CONFIG_CRYPTO_DEV_QCE) += qce/
obj-$(CONFIG_CRYPTO_DEV_ROCKCHIP) += rockchip/ obj-$(CONFIG_CRYPTO_DEV_ROCKCHIP) += rockchip/
obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
obj-$(CONFIG_CRYPTO_DEV_STM32) += stm32/
obj-$(CONFIG_CRYPTO_DEV_SUN4I_SS) += sunxi-ss/ obj-$(CONFIG_CRYPTO_DEV_SUN4I_SS) += sunxi-ss/
obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/ obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
......
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
static void crypto4xx_hw_init(struct crypto4xx_device *dev) static void crypto4xx_hw_init(struct crypto4xx_device *dev)
{ {
union ce_ring_size ring_size; union ce_ring_size ring_size;
union ce_ring_contol ring_ctrl; union ce_ring_control ring_ctrl;
union ce_part_ring_size part_ring_size; union ce_part_ring_size part_ring_size;
union ce_io_threshold io_threshold; union ce_io_threshold io_threshold;
u32 rand_num; u32 rand_num;
......
...@@ -180,7 +180,7 @@ union ce_ring_size { ...@@ -180,7 +180,7 @@ union ce_ring_size {
} __attribute__((packed)); } __attribute__((packed));
#define CRYPTO4XX_RING_CONTROL_OFFSET 0x54 #define CRYPTO4XX_RING_CONTROL_OFFSET 0x54
union ce_ring_contol { union ce_ring_control {
struct { struct {
u32 continuous:1; u32 continuous:1;
u32 rsv:5; u32 rsv:5;
......
The diffs for the remaining 70 files are collapsed.