Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu: "Here is the crypto update for 4.2: API: - Convert RNG interface to new style. - New AEAD interface with one SG list for AD and plain/cipher text. All external AEAD users have been converted. - New asymmetric key interface (akcipher). Algorithms: - Chacha20, Poly1305 and RFC7539 support. - New RSA implementation. - Jitter RNG. - DRBG is now seeded with both /dev/random and Jitter RNG. If kernel pool isn't ready then DRBG will be reseeded when it is. - DRBG is now the default crypto API RNG, replacing krng. - 842 compression (previously part of powerpc nx driver). Drivers: - Accelerated SHA-512 for arm64. - New Marvell CESA driver that supports DMA and more algorithms. - Updated powerpc nx 842 support. - Added support for SEC1 hardware to talitos" * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (292 commits) crypto: marvell/cesa - remove COMPILE_TEST dependency crypto: algif_aead - Temporarily disable all AEAD algorithms crypto: af_alg - Forbid the use internal algorithms crypto: echainiv - Only hold RNG during initialisation crypto: seqiv - Add compatibility support without RNG crypto: eseqiv - Offer normal cipher functionality without RNG crypto: chainiv - Offer normal cipher functionality without RNG crypto: user - Add CRYPTO_MSG_DELRNG crypto: user - Move cryptouser.h to uapi crypto: rng - Do not free default RNG when it becomes unused crypto: skcipher - Allow givencrypt to be NULL crypto: sahara - propagate the error on clk_disable_unprepare() failure crypto: rsa - fix invalid select for AKCIPHER crypto: picoxcell - Update to the current clk API crypto: nx - Check for bogus firmware properties crypto: marvell/cesa - add DT bindings documentation crypto: marvell/cesa - add support for Kirkwood and Dove SoCs crypto: marvell/cesa - add support for Orion SoCs crypto: marvell/cesa - add allhwsupport module parameter crypto: marvell/cesa - add support for all armada SoCs ...

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu: "Here is the crypto update for 4.2: API: - Convert RNG interface to new style. - New AEAD interface with one SG list for AD and plain/cipher text. All external AEAD users have been converted. - New asymmetric key interface (akcipher). Algorithms: - Chacha20, Poly1305 and RFC7539 support. - New RSA implementation. - Jitter RNG. - DRBG is now seeded with both /dev/random and Jitter RNG. If kernel pool isn't ready then DRBG will be reseeded when it is. - DRBG is now the default crypto API RNG, replacing krng. - 842 compression (previously part of powerpc nx driver). Drivers: - Accelerated SHA-512 for arm64. - New Marvell CESA driver that supports DMA and more algorithms. - Updated powerpc nx 842 support. - Added support for SEC1 hardware to talitos" * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (292 commits) crypto: marvell/cesa - remove COMPILE_TEST dependency crypto: algif_aead - Temporarily disable all AEAD algorithms crypto: af_alg - Forbid the use internal algorithms crypto: echainiv - Only hold RNG during initialisation crypto: seqiv - Add compatibility support without RNG crypto: eseqiv - Offer normal cipher functionality without RNG crypto: chainiv - Offer normal cipher functionality without RNG crypto: user - Add CRYPTO_MSG_DELRNG crypto: user - Move cryptouser.h to uapi crypto: rng - Do not free default RNG when it becomes unused crypto: skcipher - Allow givencrypt to be NULL crypto: sahara - propagate the error on clk_disable_unprepare() failure crypto: rsa - fix invalid select for AKCIPHER crypto: picoxcell - Update to the current clk API crypto: nx - Check for bogus firmware properties crypto: marvell/cesa - add DT bindings documentation crypto: marvell/cesa - add support for Kirkwood and Dove SoCs crypto: marvell/cesa - add support for Orion SoCs crypto: marvell/cesa - add allhwsupport module parameter crypto: marvell/cesa - add support for all armada SoCs ...
44d21c3f · Linus Torvalds · efdfce2b · fe55dfdc · 44d21c3f · 44d21c3f
174 changed file
--- a/Documentation/DocBook/crypto-API.tmpl
+++ b/Documentation/DocBook/crypto-API.tmpl
@@ -119,7 +119,7 @@

    <para>
     Note: The terms "transformation" and cipher algorithm are used
-     interchangably.
+     interchangeably.
    </para>
   </sect1>

@@ -536,8 +536,8 @@

     <para>
      For other use cases of AEAD ciphers, the ASCII art applies as
-      well, but the caller may not use the GIVCIPHER interface. In
-      this case, the caller must generate the IV.
+      well, but the caller may not use the AEAD cipher with a separate
+      IV generator. In this case, the caller must generate the IV.
     </para>

     <para>
@@ -584,8 +584,8 @@ kernel crypto API                                |   IPSEC Layer
                                                 |
 +-----------+                                    |
 |           |            (1)
-| givcipher | <-----------------------------------  esp_output
-|  (seqiv)  | ---+
+|   aead    | <-----------------------------------  esp_output
+| (seqniv)  | ---+
 +-----------+    |
                 | (2)
 +-----------+    |
@@ -620,8 +620,8 @@ kernel crypto API                                |   IPSEC Layer
     <orderedlist>
      <listitem>
       <para>
-        esp_output() invokes crypto_aead_givencrypt() to trigger an encryption
-        operation of the GIVCIPHER implementation.
+        esp_output() invokes crypto_aead_encrypt() to trigger an encryption
+        operation of the AEAD cipher with IV generator.
       </para>

       <para>
@@ -1563,7 +1563,7 @@ struct sockaddr_alg sa = {

   <sect1><title>Zero-Copy Interface</title>
    <para>
-     In addition to the send/write/read/recv system call familty, the AF_ALG
+     In addition to the send/write/read/recv system call family, the AF_ALG
     interface can be accessed with the zero-copy interface of splice/vmsplice.
     As the name indicates, the kernel tries to avoid a copy operation into
     kernel space.
@@ -1669,9 +1669,19 @@ read(opfd, out, outlen);
  </chapter>

  <chapter id="API"><title>Programming Interface</title>
+   <para>
+    Please note that the kernel crypto API contains the AEAD givcrypt
+    API (crypto_aead_giv* and aead_givcrypt_* function calls in
+    include/crypto/aead.h). This API is obsolete and will be removed
+    in the future. To obtain the functionality of an AEAD cipher with
+    internal IV generation, use the IV generator as a regular cipher.
+    For example, rfc4106(gcm(aes)) is the AEAD cipher with external
+    IV generation and seqniv(rfc4106(gcm(aes))) implies that the kernel
+    crypto API generates the IV. Different IV generators are available.
+   </para>
   <sect1><title>Block Cipher Context Data Structures</title>
 !Pinclude/linux/crypto.h Block Cipher Context Data Structures
-!Finclude/linux/crypto.h aead_request
+!Finclude/crypto/aead.h aead_request
   </sect1>
   <sect1><title>Block Cipher Algorithm Definitions</title>
 !Pinclude/linux/crypto.h Block Cipher Algorithm Definitions
@@ -1680,7 +1690,7 @@ read(opfd, out, outlen);
 !Finclude/linux/crypto.h aead_alg
 !Finclude/linux/crypto.h blkcipher_alg
 !Finclude/linux/crypto.h cipher_alg
-!Finclude/linux/crypto.h rng_alg
+!Finclude/crypto/rng.h rng_alg
   </sect1>
   <sect1><title>Asynchronous Block Cipher API</title>
 !Pinclude/linux/crypto.h Asynchronous Block Cipher API
@@ -1704,26 +1714,27 @@ read(opfd, out, outlen);
 !Finclude/linux/crypto.h ablkcipher_request_set_crypt
   </sect1>
   <sect1><title>Authenticated Encryption With Associated Data (AEAD) Cipher API</title>
-!Pinclude/linux/crypto.h Authenticated Encryption With Associated Data (AEAD) Cipher API
-!Finclude/linux/crypto.h crypto_alloc_aead
-!Finclude/linux/crypto.h crypto_free_aead
-!Finclude/linux/crypto.h crypto_aead_ivsize
-!Finclude/linux/crypto.h crypto_aead_authsize
-!Finclude/linux/crypto.h crypto_aead_blocksize
-!Finclude/linux/crypto.h crypto_aead_setkey
-!Finclude/linux/crypto.h crypto_aead_setauthsize
-!Finclude/linux/crypto.h crypto_aead_encrypt
-!Finclude/linux/crypto.h crypto_aead_decrypt
+!Pinclude/crypto/aead.h Authenticated Encryption With Associated Data (AEAD) Cipher API
+!Finclude/crypto/aead.h crypto_alloc_aead
+!Finclude/crypto/aead.h crypto_free_aead
+!Finclude/crypto/aead.h crypto_aead_ivsize
+!Finclude/crypto/aead.h crypto_aead_authsize
+!Finclude/crypto/aead.h crypto_aead_blocksize
+!Finclude/crypto/aead.h crypto_aead_setkey
+!Finclude/crypto/aead.h crypto_aead_setauthsize
+!Finclude/crypto/aead.h crypto_aead_encrypt
+!Finclude/crypto/aead.h crypto_aead_decrypt
   </sect1>
   <sect1><title>Asynchronous AEAD Request Handle</title>
-!Pinclude/linux/crypto.h Asynchronous AEAD Request Handle
-!Finclude/linux/crypto.h crypto_aead_reqsize
-!Finclude/linux/crypto.h aead_request_set_tfm
-!Finclude/linux/crypto.h aead_request_alloc
-!Finclude/linux/crypto.h aead_request_free
-!Finclude/linux/crypto.h aead_request_set_callback
-!Finclude/linux/crypto.h aead_request_set_crypt
-!Finclude/linux/crypto.h aead_request_set_assoc
+!Pinclude/crypto/aead.h Asynchronous AEAD Request Handle
+!Finclude/crypto/aead.h crypto_aead_reqsize
+!Finclude/crypto/aead.h aead_request_set_tfm
+!Finclude/crypto/aead.h aead_request_alloc
+!Finclude/crypto/aead.h aead_request_free
+!Finclude/crypto/aead.h aead_request_set_callback
+!Finclude/crypto/aead.h aead_request_set_crypt
+!Finclude/crypto/aead.h aead_request_set_assoc
+!Finclude/crypto/aead.h aead_request_set_ad
   </sect1>
   <sect1><title>Synchronous Block Cipher API</title>
 !Pinclude/linux/crypto.h Synchronous Block Cipher API

--- a/Documentation/devicetree/bindings/crypto/fsl-sec2.txt
+++ b/Documentation/devicetree/bindings/crypto/fsl-sec2.txt
-Freescale SoC SEC Security Engines versions 2.x-3.x
+Freescale SoC SEC Security Engines versions 1.x-2.x-3.x

 Required properties:

 - compatible : Should contain entries for this and backward compatible
-  SEC versions, high to low, e.g., "fsl,sec2.1", "fsl,sec2.0"
+  SEC versions, high to low, e.g., "fsl,sec2.1", "fsl,sec2.0" (SEC2/3)
+                             e.g., "fsl,sec1.2", "fsl,sec1.0" (SEC1)
+    warning: SEC1 and SEC2 are mutually exclusive
 - reg : Offset and length of the register set for the device
 - interrupts : the SEC's interrupt number
 - fsl,num-channels : An integer representing the number of channels

--- a/Documentation/devicetree/bindings/crypto/marvell-cesa.txt
+++ b/Documentation/devicetree/bindings/crypto/marvell-cesa.txt
+Marvell Cryptographic Engines And Security Accelerator
+
+Required properties:
+- compatible: should be one of the following string
+	      "marvell,orion-crypto"
+	      "marvell,kirkwood-crypto"
+	      "marvell,dove-crypto"
+	      "marvell,armada-370-crypto"
+	      "marvell,armada-xp-crypto"
+	      "marvell,armada-375-crypto"
+	      "marvell,armada-38x-crypto"
+- reg: base physical address of the engine and length of memory mapped
+       region. Can also contain an entry for the SRAM attached to the CESA,
+       but this representation is deprecated and marvell,crypto-srams should
+       be used instead
+- reg-names: "regs". Can contain an "sram" entry, but this representation
+	     is deprecated and marvell,crypto-srams should be used instead
+- interrupts: interrupt number
+- clocks: reference to the crypto engines clocks. This property is not
+	  required for orion and kirkwood platforms
+- clock-names: "cesaX" and "cesazX", X should be replaced by the crypto engine
+	       id.
+	       This property is not required for the orion and kirkwoord
+	       platforms.
+	       "cesazX" clocks are not required on armada-370 platforms
+- marvell,crypto-srams: phandle to crypto SRAM definitions
+
+Optional properties:
+- marvell,crypto-sram-size: SRAM size reserved for crypto operations, if not
+			    specified the whole SRAM is used (2KB)
+
+
+Examples:
+
+	crypto@90000 {
+		compatible = "marvell,armada-xp-crypto";
+		reg = <0x90000 0x10000>;
+		reg-names = "regs";
+		interrupts = <48>, <49>;
+		clocks = <&gateclk 23>, <&gateclk 23>;
+		clock-names = "cesa0", "cesa1";
+		marvell,crypto-srams = <&crypto_sram0>, <&crypto_sram1>;
+		marvell,crypto-sram-size = <0x600>;
+		status = "okay";
+	};
--- a/Documentation/devicetree/bindings/crypto/mv_cesa.txt
+++ b/Documentation/devicetree/bindings/crypto/mv_cesa.txt
 Marvell Cryptographic Engines And Security Accelerator

 Required properties:
- compatible : should be "marvell,orion-crypto"
- reg : base physical address of the engine and length of memory mapped
-        region, followed by base physical address of sram and its memory
-        length
- reg-names : "regs" , "sram";
- interrupts : interrupt number
+- compatible: should be one of the following string
+	      "marvell,orion-crypto"
+	      "marvell,kirkwood-crypto"
+	      "marvell,dove-crypto"
+- reg: base physical address of the engine and length of memory mapped
+       region. Can also contain an entry for the SRAM attached to the CESA,
+       but this representation is deprecated and marvell,crypto-srams should
+       be used instead
+- reg-names: "regs". Can contain an "sram" entry, but this representation
+	     is deprecated and marvell,crypto-srams should be used instead
+- interrupts: interrupt number
+- clocks: reference to the crypto engines clocks. This property is only
+	  required for Dove platforms
+- marvell,crypto-srams: phandle to crypto SRAM definitions
+
+Optional properties:
+- marvell,crypto-sram-size: SRAM size reserved for crypto operations, if not
+			    specified the whole SRAM is used (2KB)

 Examples:

 	crypto@30000 {
 		compatible = "marvell,orion-crypto";
-		reg = <0x30000 0x10000>,
-		      <0x4000000 0x800>;
-		reg-names = "regs" , "sram";
+		reg = <0x30000 0x10000>;
+		reg-names = "regs";
 		interrupts = <22>;
+		marvell,crypto-srams = <&crypto_sram>;
+		marvell,crypto-sram-size = <0x600>;
 		status = "okay";
 	};
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4879,13 +4879,23 @@ M:	Marcelo Henrique Cerri <mhcerri@linux.vnet.ibm.com>
 M:	Fionnuala Gunter <fin@linux.vnet.ibm.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
-F:	drivers/crypto/nx/
+F:	drivers/crypto/nx/Makefile
+F:	drivers/crypto/nx/Kconfig
+F:	drivers/crypto/nx/nx-aes*
+F:	drivers/crypto/nx/nx-sha*
+F:	drivers/crypto/nx/nx.*
+F:	drivers/crypto/nx/nx_csbcpb.h
+F:	drivers/crypto/nx/nx_debugfs.h

 IBM Power 842 compression accelerator
 M:	Dan Streetman <ddstreet@us.ibm.com>
 S:	Supported
-F:	drivers/crypto/nx/nx-842.c
-F:	include/linux/nx842.h
+F:	drivers/crypto/nx/Makefile
+F:	drivers/crypto/nx/Kconfig
+F:	drivers/crypto/nx/nx-842*
+F:	include/linux/sw842.h
+F:	crypto/842.c
+F:	lib/842/

 IBM Power Linux RAID adapter
 M:	Brian King <brking@us.ibm.com>

--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -53,20 +53,13 @@ config CRYPTO_SHA256_ARM
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using optimized ARM assembler and NEON, when available.

-config CRYPTO_SHA512_ARM_NEON
-	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SHA512
+config CRYPTO_SHA512_ARM
+	tristate "SHA-384/512 digest algorithm (ARM-asm and NEON)"
 	select CRYPTO_HASH
+	depends on !CPU_V7M
 	help
 	  SHA-512 secure hash standard (DFIPS 180-2) implemented
-	  using ARM NEON instructions, when available.
-
-	  This version of SHA implements a 512 bit hash with 256 bits of
-	  security against collision attacks.
-
-	  This code also includes SHA-384, a 384 bit hash with 192 bits
-	  of security against collision attacks.
+	  using optimized ARM assembler and NEON, when available.

 config CRYPTO_AES_ARM
 	tristate "AES cipher algorithms (ARM-asm)"

--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o

 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -30,7 +30,8 @@ sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
 sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
-sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
+sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
+sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
@@ -45,4 +46,7 @@ $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)

-.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
+$(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S $(obj)/sha512-core.S
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -101,15 +101,14 @@
 	\dround		q10, q11
 	blo		0f			@ AES-128: 10 rounds
 	vld1.8		{q10-q11}, [ip]!
-	beq		1f			@ AES-192: 12 rounds
 	\dround		q12, q13
+	beq		1f			@ AES-192: 12 rounds
 	vld1.8		{q12-q13}, [ip]
 	\dround		q10, q11
 0:	\fround		q12, q13, q14
 	bx		lr

-1:	\dround		q12, q13
-	\fround		q10, q11, q14
+1:	\fround		q10, q11, q14
 	bx		lr
 	.endm

@@ -122,8 +121,8 @@
 	 *   q2        : third in/output block (_3x version only)
 	 *   q8        : first round key
 	 *   q9        : secound round key
-	 *   ip        : address of 3rd round key
 	 *   q14       : final round key
+	 *   r2        : address of round key array
 	 *   r3        : number of rounds
 	 */
 	.align		6

--- a/arch/arm/crypto/sha512-armv4.pl
+++ b/arch/arm/crypto/sha512-armv4.pl
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is disappointing result.
+# Technical writers asserted that 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On side note Cortex-A15 processes one byte in
+# 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	# parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############	r13 is stack pointer
+$Ktbl="r14";
+############	r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
+	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
+	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
+	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
+	eor	$t0,$t0,$Elo,lsr#18
+	eor	$t1,$t1,$Ehi,lsr#18
+	eor	$t0,$t0,$Ehi,lsl#14
+	eor	$t1,$t1,$Elo,lsl#14
+	eor	$t0,$t0,$Ehi,lsr#9
+	eor	$t1,$t1,$Elo,lsr#9
+	eor	$t0,$t0,$Elo,lsl#23
+	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
+	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
+	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
+	ldr	$t3,[sp,#$Goff+4]	@ g.hi
+
+	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
+	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
+	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
+	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
+	eor	$t0,$t0,$t2
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
+
+	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
+	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
+	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
+	adc	$Thi,$Thi,$t3		@ T += K[i]
+	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
+	adc	$Ehi,$Ehi,$Thi		@ d += T
+	teq	$t0,#$magic
+
+	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	$Ktbl,$Ktbl,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	$t0,$Alo,lsr#28
+	mov	$t1,$Ahi,lsr#28
+	eor	$t0,$t0,$Ahi,lsl#4
+	eor	$t1,$t1,$Alo,lsl#4
+	eor	$t0,$t0,$Ahi,lsr#2
+	eor	$t1,$t1,$Alo,lsr#2
+	eor	$t0,$t0,$Alo,lsl#30
+	eor	$t1,$t1,$Ahi,lsl#30
+	eor	$t0,$t0,$Ahi,lsr#7
+	eor	$t1,$t1,$Alo,lsr#7
+	eor	$t0,$t0,$Alo,lsl#25
+	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
+	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
+	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
+
+	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
+	ldr	$t2,[sp,#$Coff+4]	@ c.hi
+	and	$Alo,$Alo,$t3
+	and	$t3,$Ahi,$t1
+	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
+	and	$Ahi,$Ahi,$t2
+	adds	$Alo,$Alo,$Tlo
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
+	add	$Ktbl,$Ktbl,#8
+___
+}
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4-r12,lr}
+	sub	$Ktbl,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	$Elo,[$ctx,#$Eoff+$lo]
+	ldr	$Ehi,[$ctx,#$Eoff+$hi]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+.Loop:
+	str	$t0, [sp,#$Goff+0]
+	str	$t1, [sp,#$Goff+4]
+	str	$t2, [sp,#$Hoff+0]
+	str	$t3, [sp,#$Hoff+4]
+	ldr	$Alo,[$ctx,#$Aoff+$lo]
+	ldr	$Ahi,[$ctx,#$Aoff+$hi]
+	ldr	$Tlo,[$ctx,#$Boff+$lo]
+	ldr	$Thi,[$ctx,#$Boff+$hi]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	str	$Tlo,[sp,#$Boff+0]
+	str	$Thi,[sp,#$Boff+4]
+	str	$t0, [sp,#$Coff+0]
+	str	$t1, [sp,#$Coff+4]
+	str	$t2, [sp,#$Doff+0]
+	str	$t3, [sp,#$Doff+4]
+	ldr	$Tlo,[$ctx,#$Foff+$lo]
+	ldr	$Thi,[$ctx,#$Foff+$hi]
+	str	$Tlo,[sp,#$Foff+0]
+	str	$Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	$Tlo,[$inp,#7]
+	ldrb	$t0, [$inp,#6]
+	ldrb	$t1, [$inp,#5]
+	ldrb	$t2, [$inp,#4]
+	ldrb	$Thi,[$inp,#3]
+	ldrb	$t3, [$inp,#2]
+	orr	$Tlo,$Tlo,$t0,lsl#8
+	ldrb	$t0, [$inp,#1]
+	orr	$Tlo,$Tlo,$t1,lsl#16
+	ldrb	$t1, [$inp],#8
+	orr	$Tlo,$Tlo,$t2,lsl#24
+	orr	$Thi,$Thi,$t3,lsl#8
+	orr	$Thi,$Thi,$t0,lsl#16
+	orr	$Thi,$Thi,$t1,lsl#24
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
+___
+	&BODY_00_15(0x94);
+$code.=<<___;
+	tst	$Ktbl,#1
+	beq	.L00_15
+	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
+	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
+	eor	$Tlo,$Tlo,$t1,lsl#31
+	eor	$Thi,$Thi,$t0,lsl#31
+	eor	$Tlo,$Tlo,$t0,lsr#8
+	eor	$Thi,$Thi,$t1,lsr#8
+	eor	$Tlo,$Tlo,$t1,lsl#24
+	eor	$Thi,$Thi,$t0,lsl#24
+	eor	$Tlo,$Tlo,$t0,lsr#7
+	eor	$Thi,$Thi,$t1,lsr#7
+	eor	$Tlo,$Tlo,$t1,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	$t0,$t2,lsr#19
+	mov	$t1,$t3,lsr#19
+	eor	$t0,$t0,$t3,lsl#13
+	eor	$t1,$t1,$t2,lsl#13
+	eor	$t0,$t0,$t3,lsr#29
+	eor	$t1,$t1,$t2,lsr#29
+	eor	$t0,$t0,$t2,lsl#3
+	eor	$t1,$t1,$t3,lsl#3
+	eor	$t0,$t0,$t2,lsr#6
+	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
+	eor	$t0,$t0,$t3,lsl#26
+
+	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
+	adc	$Thi,$Thi,$t1
+
+	ldr	$t1,[sp,#`$Xoff+8*16`+4]
+	adds	$Tlo,$Tlo,$t2
+	adc	$Thi,$Thi,$t3
+	adds	$Tlo,$Tlo,$t0
+	adc	$Thi,$Thi,$t1
+___
+	&BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	beq	.L16_79
+	bic	$Ktbl,$Ktbl,#1
+
+	ldr	$Tlo,[sp,#$Boff+0]
+	ldr	$Thi,[sp,#$Boff+4]
+	ldr	$t0, [$ctx,#$Aoff+$lo]
+	ldr	$t1, [$ctx,#$Aoff+$hi]
+	ldr	$t2, [$ctx,#$Boff+$lo]
+	ldr	$t3, [$ctx,#$Boff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Boff+$hi]
+
+	ldr	$Alo,[sp,#$Coff+0]
+	ldr	$Ahi,[sp,#$Coff+4]
+	ldr	$Tlo,[sp,#$Doff+0]
+	ldr	$Thi,[sp,#$Doff+4]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Doff+$hi]
+
+	ldr	$Tlo,[sp,#$Foff+0]
+	ldr	$Thi,[sp,#$Foff+4]
+	ldr	$t0, [$ctx,#$Eoff+$lo]
+	ldr	$t1, [$ctx,#$Eoff+$hi]
+	ldr	$t2, [$ctx,#$Foff+$lo]
+	ldr	$t3, [$ctx,#$Foff+$hi]
+	adds	$Elo,$Elo,$t0
+	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
+	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Foff+$hi]
+
+	ldr	$Alo,[sp,#$Goff+0]
+	ldr	$Ahi,[sp,#$Goff+4]
+	ldr	$Tlo,[sp,#$Hoff+0]
+	ldr	$Thi,[sp,#$Hoff+4]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Hoff+$hi]
+
+	add	sp,sp,#640
+	sub	$Ktbl,$Ktbl,#640
+
+	teq	$inp,$len
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+#if $i>0
+	 vadd.i64	$a,$Maj			@ h+=Maj from the past
+#endif
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vmov		$Ch,$e
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	veor		$t1,$t0
+	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	veor		$t2,$t1			@ Sigma1(e)
+	vadd.i64	$T1,$Ch,$h
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vadd.i64	$T1,$t2
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vadd.i64	$K,@X[$i%16]
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	veor		$Maj,$a,$b
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	veor		$h,$t0,$t1
+	vadd.i64	$T1,$K
+	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
+	veor		$h,$t2			@ Sigma0(a)
+	vadd.i64	$d,$T1
+	vadd.i64	$Maj,$T1
+	@ vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	 vadd.i64	@_[0],d30			@ h+=Maj from the past
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	VFP_ABI_PUSH
+	adrl	$Ktbl,K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	 vadd.i64	$A,d30		@ h+=Maj from the past
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	VFP_ABI_POP
+	ret				@ bx lr
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx	lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
--- a/arch/arm/crypto/sha512-armv7-neon.S
+++ b/arch/arm/crypto/sha512-armv7-neon.S
-/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <linux/linkage.h>
-
-
-.syntax unified
-.code   32
-.fpu neon
-
-.text
-
-/* structure of SHA512_CONTEXT */
-#define hd_a 0
-#define hd_b ((hd_a) + 8)
-#define hd_c ((hd_b) + 8)
-#define hd_d ((hd_c) + 8)
-#define hd_e ((hd_d) + 8)
-#define hd_f ((hd_e) + 8)
-#define hd_g ((hd_f) + 8)
-
-/* register macros */
-#define RK %r2
-
-#define RA d0
-#define RB d1
-#define RC d2
-#define RD d3
-#define RE d4
-#define RF d5
-#define RG d6
-#define RH d7
-
-#define RT0 d8
-#define RT1 d9
-#define RT2 d10
-#define RT3 d11
-#define RT4 d12
-#define RT5 d13
-#define RT6 d14
-#define RT7 d15
-
-#define RT01q q4
-#define RT23q q5
-#define RT45q q6
-#define RT67q q7
-
-#define RW0 d16
-#define RW1 d17
-#define RW2 d18
-#define RW3 d19
-#define RW4 d20
-#define RW5 d21
-#define RW6 d22
-#define RW7 d23
-#define RW8 d24
-#define RW9 d25
-#define RW10 d26
-#define RW11 d27
-#define RW12 d28
-#define RW13 d29
-#define RW14 d30
-#define RW15 d31
-
-#define RW01q q8
-#define RW23q q9
-#define RW45q q10
-#define RW67q q11
-#define RW89q q12
-#define RW1011q q13
-#define RW1213q q14
-#define RW1415q q15
-
-/***********************************************************************
- * ARM assembly implementation of sha512 transform
- ***********************************************************************/
-#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
-                     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
-	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
-	vshr.u64 RT2, re, #14; \
-	vshl.u64 RT3, re, #64 - 14; \
-	interleave_op(arg1); \
-	vshr.u64 RT4, re, #18; \
-	vshl.u64 RT5, re, #64 - 18; \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, re, #41; \
-	vshl.u64 RT5, re, #64 - 41; \
-	vadd.u64 RT0, RT0, rw0; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, re; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, rf, rg; \
-	\
-	vadd.u64 RT1, RT1, rh; \
-	vshr.u64 RT2, ra, #28; \
-	vshl.u64 RT3, ra, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, ra, #34; \
-	vshl.u64 RT5, ra, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* h = Sum0 (a) + Maj (a, b, c); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, ra, #39; \
-	vshl.u64 RT5, ra, #64 - 39; \
-	veor.64 RT0, ra, rb; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rc, rb; \
-	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
-	veor.64 rh, RT2, RT3; \
-	\
-	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
-	vshr.u64 RT2, rd, #14; \
-	vshl.u64 RT3, rd, #64 - 14; \
-	vadd.u64 rh, rh, RT0; \
-	vshr.u64 RT4, rd, #18; \
-	vshl.u64 RT5, rd, #64 - 18; \
-	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rd, #41; \
-	vshl.u64 RT5, rd, #64 - 41; \
-	vadd.u64 RT0, RT0, rw1; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, rd; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, re, rf; \
-	\
-	vadd.u64 RT1, RT1, rg; \
-	vshr.u64 RT2, rh, #28; \
-	vshl.u64 RT3, rh, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, rh, #34; \
-	vshl.u64 RT5, rh, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* g = Sum0 (h) + Maj (h, a, b); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rh, #39; \
-	vshl.u64 RT5, rh, #64 - 39; \
-	veor.64 RT0, rh, ra; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rb, ra; \
-	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
-	veor.64 rg, RT2, RT3; \
-	\
-	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
-	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
-	\
-	/**** S0(w[1:2]) */ \
-	\
-	/* w[0:1] += w[9:10] */ \
-	/* RT23q = rw1:rw2 */ \
-	vext.u64 RT23q, rw01q, rw23q, #1; \
-	vadd.u64 rw0, rw9; \
-	vadd.u64 rg, rg, RT0; \
-	vadd.u64 rw1, rw10;\
-	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
-	\
-	vshr.u64 RT45q, RT23q, #1; \
-	vshl.u64 RT67q, RT23q, #64 - 1; \
-	vshr.u64 RT01q, RT23q, #8; \
-	veor.u64 RT45q, RT45q, RT67q; \
-	vshl.u64 RT67q, RT23q, #64 - 8; \
-	veor.u64 RT45q, RT45q, RT01q; \
-	vshr.u64 RT01q, RT23q, #7; \
-	veor.u64 RT45q, RT45q, RT67q; \
-	\
-	/**** S1(w[14:15]) */ \
-	vshr.u64 RT23q, rw1415q, #6; \
-	veor.u64 RT01q, RT01q, RT45q; \
-	vshr.u64 RT45q, rw1415q, #19; \
-	vshl.u64 RT67q, rw1415q, #64 - 19; \
-	veor.u64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT45q, rw1415q, #61; \
-	veor.u64 RT23q, RT23q, RT67q; \
-	vshl.u64 RT67q, rw1415q, #64 - 61; \
-	veor.u64 RT23q, RT23q, RT45q; \
-	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
-	veor.u64 RT01q, RT23q, RT67q;
-#define vadd_RT01q(rw01q) \
-	/* w[0:1] += S(w[14:15]) */ \
-	vadd.u64 rw01q, RT01q;
-
-#define dummy(_) /*_*/
-
-#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
-	              interleave_op1, arg1, interleave_op2, arg2) \
-	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
-	vshr.u64 RT2, re, #14; \
-	vshl.u64 RT3, re, #64 - 14; \
-	interleave_op1(arg1); \
-	vshr.u64 RT4, re, #18; \
-	vshl.u64 RT5, re, #64 - 18; \
-	interleave_op2(arg2); \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, re, #41; \
-	vshl.u64 RT5, re, #64 - 41; \
-	vadd.u64 RT0, RT0, rw0; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, re; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, rf, rg; \
-	\
-	vadd.u64 RT1, RT1, rh; \
-	vshr.u64 RT2, ra, #28; \
-	vshl.u64 RT3, ra, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, ra, #34; \
-	vshl.u64 RT5, ra, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* h = Sum0 (a) + Maj (a, b, c); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, ra, #39; \
-	vshl.u64 RT5, ra, #64 - 39; \
-	veor.64 RT0, ra, rb; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rc, rb; \
-	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
-	veor.64 rh, RT2, RT3; \
-	\
-	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
-	vshr.u64 RT2, rd, #14; \
-	vshl.u64 RT3, rd, #64 - 14; \
-	vadd.u64 rh, rh, RT0; \
-	vshr.u64 RT4, rd, #18; \
-	vshl.u64 RT5, rd, #64 - 18; \
-	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rd, #41; \
-	vshl.u64 RT5, rd, #64 - 41; \
-	vadd.u64 RT0, RT0, rw1; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, rd; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, re, rf; \
-	\
-	vadd.u64 RT1, RT1, rg; \
-	vshr.u64 RT2, rh, #28; \
-	vshl.u64 RT3, rh, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, rh, #34; \
-	vshl.u64 RT5, rh, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* g = Sum0 (h) + Maj (h, a, b); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rh, #39; \
-	vshl.u64 RT5, rh, #64 - 39; \
-	veor.64 RT0, rh, ra; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rb, ra; \
-	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
-	veor.64 rg, RT2, RT3;
-#define vadd_rg_RT0(rg) \
-	vadd.u64 rg, rg, RT0;
-#define vadd_rg_RT1(rg) \
-	vadd.u64 rg, rg, RT1; /* g+=t1; */
-
-.align 3
-ENTRY(sha512_transform_neon)
-	/* Input:
-	 *	%r0: SHA512_CONTEXT
-	 *	%r1: data
-	 *	%r2: u64 k[] constants
-	 *	%r3: nblks
-	 */
-	push {%lr};
-
-	mov %lr, #0;
-
-	/* Load context to d0-d7 */
-	vld1.64 {RA-RD}, [%r0]!;
-	vld1.64 {RE-RH}, [%r0];
-	sub %r0, #(4*8);
-
-	/* Load input to w[16], d16-d31 */
-	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
-	vld1.64 {RW0-RW3}, [%r1]!;
-	vld1.64 {RW4-RW7}, [%r1]!;
-	vld1.64 {RW8-RW11}, [%r1]!;
-	vld1.64 {RW12-RW15}, [%r1]!;
-#ifdef __ARMEL__
-	/* byteswap */
-	vrev64.8 RW01q, RW01q;
-	vrev64.8 RW23q, RW23q;
-	vrev64.8 RW45q, RW45q;
-	vrev64.8 RW67q, RW67q;
-	vrev64.8 RW89q, RW89q;
-	vrev64.8 RW1011q, RW1011q;
-	vrev64.8 RW1213q, RW1213q;
-	vrev64.8 RW1415q, RW1415q;
-#endif
-
-	/* EABI says that d8-d15 must be preserved by callee. */
-	/*vpush {RT0-RT7};*/
-
-.Loop:
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
-		     RW23q, RW1415q, RW9, RW10, dummy, _);
-	b .Lenter_rounds;
-
-.Loop_rounds:
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
-		     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
-.Lenter_rounds:
-	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
-		     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
-	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
-		     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
-	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
-		     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
-		     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
-	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
-		     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
-	add %lr, #16;
-	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
-		     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
-	cmp %lr, #64;
-	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
-		     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
-	bne .Loop_rounds;
-
-	subs %r3, #1;
-
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
-		      vadd_RT01q, RW1415q, dummy, _);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-	beq .Lhandle_tail;
-	vld1.64 {RW0-RW3}, [%r1]!;
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-#ifdef __ARMEL__
-	vrev64.8 RW01q, RW01q;
-	vrev64.8 RW23q, RW23q;
-#endif
-	vld1.64 {RW4-RW7}, [%r1]!;
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
-		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-#ifdef __ARMEL__
-	vrev64.8 RW45q, RW45q;
-	vrev64.8 RW67q, RW67q;
-#endif
-	vld1.64 {RW8-RW11}, [%r1]!;
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-#ifdef __ARMEL__
-	vrev64.8 RW89q, RW89q;
-	vrev64.8 RW1011q, RW1011q;
-#endif
-	vld1.64 {RW12-RW15}, [%r1]!;
-	vadd_rg_RT0(RA);
-	vadd_rg_RT1(RA);
-
-	/* Load context */
-	vld1.64 {RT0-RT3}, [%r0]!;
-	vld1.64 {RT4-RT7}, [%r0];
-	sub %r0, #(4*8);
-
-#ifdef __ARMEL__
-	vrev64.8 RW1213q, RW1213q;
-	vrev64.8 RW1415q, RW1415q;
-#endif
-
-	vadd.u64 RA, RT0;
-	vadd.u64 RB, RT1;
-	vadd.u64 RC, RT2;
-	vadd.u64 RD, RT3;
-	vadd.u64 RE, RT4;
-	vadd.u64 RF, RT5;
-	vadd.u64 RG, RT6;
-	vadd.u64 RH, RT7;
-
-	/* Store the first half of context */
-	vst1.64 {RA-RD}, [%r0]!;
-	sub RK, $(8*80);
-	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-	mov %lr, #0;
-	sub %r0, #(4*8);
-
-	b .Loop;
-
-.Lhandle_tail:
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
-		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-
-	/* Load context to d16-d23 */
-	vld1.64 {RW0-RW3}, [%r0]!;
-	vadd_rg_RT0(RA);
-	vld1.64 {RW4-RW7}, [%r0];
-	vadd_rg_RT1(RA);
-	sub %r0, #(4*8);
-
-	vadd.u64 RA, RW0;
-	vadd.u64 RB, RW1;
-	vadd.u64 RC, RW2;
-	vadd.u64 RD, RW3;
-	vadd.u64 RE, RW4;
-	vadd.u64 RF, RW5;
-	vadd.u64 RG, RW6;
-	vadd.u64 RH, RW7;
-
-	/* Store the first half of context */
-	vst1.64 {RA-RD}, [%r0]!;
-
-	/* Clear used registers */
-	/* d16-d31 */
-	veor.u64 RW01q, RW01q;
-	veor.u64 RW23q, RW23q;
-	veor.u64 RW45q, RW45q;
-	veor.u64 RW67q, RW67q;
-	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-	veor.u64 RW89q, RW89q;
-	veor.u64 RW1011q, RW1011q;
-	veor.u64 RW1213q, RW1213q;
-	veor.u64 RW1415q, RW1415q;
-	/* d8-d15 */
-	/*vpop {RT0-RT7};*/
-	/* d0-d7 (q0-q3) */
-	veor.u64 %q0, %q0;
-	veor.u64 %q1, %q1;
-	veor.u64 %q2, %q2;
-	veor.u64 %q3, %q3;
-
-	pop {%pc};
-ENDPROC(sha512_transform_neon)
--- a/arch/arm/crypto/sha512-core.S_shipped
+++ b/arch/arm/crypto/sha512-core.S_shipped
--- a/arch/arm/crypto/sha512-glue.c
+++ b/arch/arm/crypto/sha512-glue.c
+/*
+ * sha512-glue.c - accelerated SHA-384/512 for ARM
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+#include "sha512.h"
+
+MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha384-arm");
+MODULE_ALIAS_CRYPTO("sha512-arm");
+
+asmlinkage void sha512_block_data_order(u64 *state, u8 const *src, int blocks);
+
+int sha512_arm_update(struct shash_desc *desc, const u8 *data,
+		      unsigned int len)
+{
+	return sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order);
+}
+
+int sha512_arm_final(struct shash_desc *desc, u8 *out)
+{
+	sha512_base_do_finalize(desc,
+		(sha512_block_fn *)sha512_block_data_order);
+	return sha512_base_finish(desc, out);
+}
+
+int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
+		     unsigned int len, u8 *out)
+{
+	sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order);
+	return sha512_arm_final(desc, out);
+}
+
+static struct shash_alg sha512_arm_algs[] = { {
+	.init			= sha384_base_init,
+	.update			= sha512_arm_update,
+	.final			= sha512_arm_final,
+	.finup			= sha512_arm_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA384_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "sha384-arm",
+		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+},  {
+	.init			= sha512_base_init,
+	.update			= sha512_arm_update,
+	.final			= sha512_arm_final,
+	.finup			= sha512_arm_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA512_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "sha512-arm",
+		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+} };
+
+static int __init sha512_arm_mod_init(void)
+{
+	int err;
+
+	err = crypto_register_shashes(sha512_arm_algs,
+				      ARRAY_SIZE(sha512_arm_algs));
+	if (err)
+		return err;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		err = crypto_register_shashes(sha512_neon_algs,
+					      ARRAY_SIZE(sha512_neon_algs));
+		if (err)
+			goto err_unregister;
+	}
+	return 0;
+
+err_unregister:
+	crypto_unregister_shashes(sha512_arm_algs,
+				  ARRAY_SIZE(sha512_arm_algs));
+
+	return err;
+}
+
+static void __exit sha512_arm_mod_fini(void)
+{
+	crypto_unregister_shashes(sha512_arm_algs,
+				  ARRAY_SIZE(sha512_arm_algs));
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha512_neon_algs,
+					  ARRAY_SIZE(sha512_neon_algs));
+}
+
+module_init(sha512_arm_mod_init);
+module_exit(sha512_arm_mod_fini);
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ b/arch/arm/crypto/sha512-neon-glue.c
+/*
+ * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+#include "sha512.h"
+
+MODULE_ALIAS_CRYPTO("sha384-neon");
+MODULE_ALIAS_CRYPTO("sha512-neon");
+
+asmlinkage void sha512_block_data_order_neon(u64 *state, u8 const *src,
+					     int blocks);
+
+static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	if (!may_use_simd() ||
+	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+		return sha512_arm_update(desc, data, len);
+
+	kernel_neon_begin();
+	sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order_neon);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
+			     unsigned int len, u8 *out)
+{
+	if (!may_use_simd())
+		return sha512_arm_finup(desc, data, len, out);
+
+	kernel_neon_begin();
+	if (len)
+		sha512_base_do_update(desc, data, len,
+			(sha512_block_fn *)sha512_block_data_order_neon);
+	sha512_base_do_finalize(desc,
+		(sha512_block_fn *)sha512_block_data_order_neon);
+	kernel_neon_end();
+
+	return sha512_base_finish(desc, out);
+}
+
+static int sha512_neon_final(struct shash_desc *desc, u8 *out)
+{
+	return sha512_neon_finup(desc, NULL, 0, out);
+}
+
+struct shash_alg sha512_neon_algs[] = { {
+	.init			= sha384_base_init,
+	.update			= sha512_neon_update,
+	.final			= sha512_neon_final,
+	.finup			= sha512_neon_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA384_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "sha384-neon",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+
+	}
+},  {
+	.init			= sha512_base_init,
+	.update			= sha512_neon_update,
+	.final			= sha512_neon_final,
+	.finup			= sha512_neon_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA512_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "sha512-neon",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+} };
--- a/arch/arm/crypto/sha512.h
+++ b/arch/arm/crypto/sha512.h
+
+int sha512_arm_update(struct shash_desc *desc, const u8 *data,
+		      unsigned int len);
+
+int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
+		     unsigned int len, u8 *out);
+
+extern struct shash_alg sha512_neon_algs[2];
--- a/arch/arm/crypto/sha512_neon_glue.c
+++ b/arch/arm/crypto/sha512_neon_glue.c
-/*
- * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
- * using NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha512_ssse3_glue.c:
- *   Copyright (C) 2013 Intel Corporation
- *   Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <crypto/sha.h>
-#include <asm/byteorder.h>
-#include <asm/simd.h>
-#include <asm/neon.h>
-
-
-static const u64 sha512_k[] = {
-	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
-	0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
-	0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
-	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
-	0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
-	0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
-	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
-	0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
-	0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
-	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
-	0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
-	0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
-	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
-	0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
-	0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
-	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
-	0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
-	0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
-	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
-	0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
-	0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
-	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
-	0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
-	0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
-	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
-	0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
-	0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
-	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
-	0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
-	0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
-	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
-	0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
-	0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
-	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
-	0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
-	0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
-	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
-	0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
-	0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
-	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
-};
-
-
-asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
-				      const u64 k[], unsigned int num_blks);
-
-
-static int sha512_neon_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA512_H0;
-	sctx->state[1] = SHA512_H1;
-	sctx->state[2] = SHA512_H2;
-	sctx->state[3] = SHA512_H3;
-	sctx->state[4] = SHA512_H4;
-	sctx->state[5] = SHA512_H5;
-	sctx->state[6] = SHA512_H6;
-	sctx->state[7] = SHA512_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
-				unsigned int len, unsigned int partial)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count[0] += len;
-	if (sctx->count[0] < len)
-		sctx->count[1]++;
-
-	if (partial) {
-		done = SHA512_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
-	}
-
-	if (len - done >= SHA512_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
-
-		sha512_transform_neon(sctx->state, data + done, sha512_k,
-				      rounds);
-
-		done += rounds * SHA512_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
-
-	return 0;
-}
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
-	int res;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA512_BLOCK_SIZE) {
-		sctx->count[0] += len;
-		if (sctx->count[0] < len)
-			sctx->count[1]++;
-		memcpy(sctx->buf + partial, data, len);
-
-		return 0;
-	}
-
-	if (!may_use_simd()) {
-		res = crypto_sha512_update(desc, data, len);
-	} else {
-		kernel_neon_begin();
-		res = __sha512_neon_update(desc, data, len, partial);
-		kernel_neon_end();
-	}
-
-	return res;
-}
-
-
-/* Add padding and return the message digest. */
-static int sha512_neon_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be64 *dst = (__be64 *)out;
-	__be64 bits[2];
-	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
-	/* save number of bits */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128 and append length */
-	index = sctx->count[0] & 0x7f;
-	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
-
-	if (!may_use_simd()) {
-		crypto_sha512_update(desc, padding, padlen);
-		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_neon_begin();
-		/* We need to fill a whole block for __sha512_neon_update() */
-		if (padlen <= 112) {
-			sctx->count[0] += padlen;
-			if (sctx->count[0] < padlen)
-				sctx->count[1]++;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha512_neon_update(desc, padding, padlen, index);
-		}
-		__sha512_neon_update(desc, (const u8 *)&bits,
-					sizeof(bits), 112);
-		kernel_neon_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_neon_export(struct shash_desc *desc, void *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_neon_import(struct shash_desc *desc, const void *in)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha384_neon_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA384_H0;
-	sctx->state[1] = SHA384_H1;
-	sctx->state[2] = SHA384_H2;
-	sctx->state[3] = SHA384_H3;
-	sctx->state[4] = SHA384_H4;
-	sctx->state[5] = SHA384_H5;
-	sctx->state[6] = SHA384_H6;
-	sctx->state[7] = SHA384_H7;
-
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA512_DIGEST_SIZE];
-
-	sha512_neon_final(desc, D);
-
-	memcpy(hash, D, SHA384_DIGEST_SIZE);
-	memzero_explicit(D, SHA512_DIGEST_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg algs[] = { {
-	.digestsize	=	SHA512_DIGEST_SIZE,
-	.init		=	sha512_neon_init,
-	.update		=	sha512_neon_update,
-	.final		=	sha512_neon_final,
-	.export		=	sha512_neon_export,
-	.import		=	sha512_neon_import,
-	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
-	.base		=	{
-		.cra_name	=	"sha512",
-		.cra_driver_name =	"sha512-neon",
-		.cra_priority	=	250,
-		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize	=	SHA512_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-},  {
-	.digestsize	=	SHA384_DIGEST_SIZE,
-	.init		=	sha384_neon_init,
-	.update		=	sha512_neon_update,
-	.final		=	sha384_neon_final,
-	.export		=	sha512_neon_export,
-	.import		=	sha512_neon_import,
-	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
-	.base		=	{
-		.cra_name	=	"sha384",
-		.cra_driver_name =	"sha384-neon",
-		.cra_priority	=	250,
-		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize	=	SHA384_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init sha512_neon_mod_init(void)
-{
-	if (!cpu_has_neon())
-		return -ENODEV;
-
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_neon_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha512_neon_mod_init);
-module_exit(sha512_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384");
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -13,7 +13,7 @@
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
 #include <crypto/scatterwalk.h>
-#include <linux/crypto.h>
+#include <crypto/internal/aead.h>
 #include <linux/module.h>

 #include "aes-ce-setkey.h"

--- a/arch/mips/cavium-octeon/crypto/octeon-md5.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c
@@ -69,10 +69,10 @@ static int octeon_md5_init(struct shash_desc *desc)
 {
 	struct md5_state *mctx = shash_desc_ctx(desc);

-	mctx->hash[0] = cpu_to_le32(0x67452301);
-	mctx->hash[1] = cpu_to_le32(0xefcdab89);
-	mctx->hash[2] = cpu_to_le32(0x98badcfe);
-	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;

 	return 0;

--- a/arch/nios2/kernel/time.c
+++ b/arch/nios2/kernel/time.c
@@ -8,6 +8,7 @@
 * for more details.
 */

+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
@@ -106,6 +107,7 @@ cycles_t get_cycles(void)
 {
 	return nios2_timer_read(&nios2_cs.cs);
 }
+EXPORT_SYMBOL(get_cycles);

 static void nios2_timer_start(struct nios2_timer *timer)
 {

--- a/arch/powerpc/crypto/md5-glue.c
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -37,10 +37,10 @@ static int ppc_md5_init(struct shash_desc *desc)
 {
 	struct md5_state *sctx = shash_desc_ctx(desc);

-	sctx->hash[0] = 0x67452301;
-	sctx->hash[1] = 0xefcdab89;
-	sctx->hash[2] = 0x98badcfe;
-	sctx->hash[3] =	0x10325476;
+	sctx->hash[0] = MD5_H0;
+	sctx->hash[1] = MD5_H1;
+	sctx->hash[2] = MD5_H2;
+	sctx->hash[3] =	MD5_H3;
 	sctx->byte_count = 0;

 	return 0;

--- a/arch/powerpc/include/asm/icswx.h
+++ b/arch/powerpc/include/asm/icswx.h
+/*
+ * ICSWX api
+ *
+ * Copyright (C) 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This provides the Initiate Coprocessor Store Word Indexed (ICSWX)
+ * instruction.  This instruction is used to communicate with PowerPC
+ * coprocessors.  This also provides definitions of the structures used
+ * to communicate with the coprocessor.
+ *
+ * The RFC02130: Coprocessor Architecture document is the reference for
+ * everything in this file unless otherwise noted.
+ */
+#ifndef _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_
+#define _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_
+
+#include <asm/ppc-opcode.h> /* for PPC_ICSWX */
+
+/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */
+
+#define CCB_VALUE		(0x3fffffffffffffff)
+#define CCB_ADDRESS		(0xfffffffffffffff8)
+#define CCB_CM			(0x0000000000000007)
+#define CCB_CM0			(0x0000000000000004)
+#define CCB_CM12		(0x0000000000000003)
+
+#define CCB_CM0_ALL_COMPLETIONS	(0x0)
+#define CCB_CM0_LAST_IN_CHAIN	(0x4)
+#define CCB_CM12_STORE		(0x0)
+#define CCB_CM12_INTERRUPT	(0x1)
+
+#define CCB_SIZE		(0x10)
+#define CCB_ALIGN		CCB_SIZE
+
+struct coprocessor_completion_block {
+	__be64 value;
+	__be64 address;
+} __packed __aligned(CCB_ALIGN);
+
+
+/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */
+
+#define CSB_V			(0x80)
+#define CSB_F			(0x04)
+#define CSB_CH			(0x03)
+#define CSB_CE_INCOMPLETE	(0x80)
+#define CSB_CE_TERMINATION	(0x40)
+#define CSB_CE_TPBC		(0x20)
+
+#define CSB_CC_SUCCESS		(0)
+#define CSB_CC_INVALID_ALIGN	(1)
+#define CSB_CC_OPERAND_OVERLAP	(2)
+#define CSB_CC_DATA_LENGTH	(3)
+#define CSB_CC_TRANSLATION	(5)
+#define CSB_CC_PROTECTION	(6)
+#define CSB_CC_RD_EXTERNAL	(7)
+#define CSB_CC_INVALID_OPERAND	(8)
+#define CSB_CC_PRIVILEGE	(9)
+#define CSB_CC_INTERNAL		(10)
+#define CSB_CC_WR_EXTERNAL	(12)
+#define CSB_CC_NOSPC		(13)
+#define CSB_CC_EXCESSIVE_DDE	(14)
+#define CSB_CC_WR_TRANSLATION	(15)
+#define CSB_CC_WR_PROTECTION	(16)
+#define CSB_CC_UNKNOWN_CODE	(17)
+#define CSB_CC_ABORT		(18)
+#define CSB_CC_TRANSPORT	(20)
+#define CSB_CC_SEGMENTED_DDL	(31)
+#define CSB_CC_PROGRESS_POINT	(32)
+#define CSB_CC_DDE_OVERFLOW	(33)
+#define CSB_CC_SESSION		(34)
+#define CSB_CC_PROVISION	(36)
+#define CSB_CC_CHAIN		(37)
+#define CSB_CC_SEQUENCE		(38)
+#define CSB_CC_HW		(39)
+
+#define CSB_SIZE		(0x10)
+#define CSB_ALIGN		CSB_SIZE
+
+struct coprocessor_status_block {
+	u8 flags;
+	u8 cs;
+	u8 cc;
+	u8 ce;
+	__be32 count;
+	__be64 address;
+} __packed __aligned(CSB_ALIGN);
+
+
+/* Chapter 6.5.10 Data-Descriptor List (DDL)
+ * each list contains one or more Data-Descriptor Entries (DDE)
+ */
+
+#define DDE_P			(0x8000)
+
+#define DDE_SIZE		(0x10)
+#define DDE_ALIGN		DDE_SIZE
+
+struct data_descriptor_entry {
+	__be16 flags;
+	u8 count;
+	u8 index;
+	__be32 length;
+	__be64 address;
+} __packed __aligned(DDE_ALIGN);
+
+
+/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
+
+#define CRB_SIZE		(0x80)
+#define CRB_ALIGN		(0x100) /* Errata: requires 256 alignment */
+
+/* Coprocessor Status Block field
+ *   ADDRESS	address of CSB
+ *   C		CCB is valid
+ *   AT		0 = addrs are virtual, 1 = addrs are phys
+ *   M		enable perf monitor
+ */
+#define CRB_CSB_ADDRESS		(0xfffffffffffffff0)
+#define CRB_CSB_C		(0x0000000000000008)
+#define CRB_CSB_AT		(0x0000000000000002)
+#define CRB_CSB_M		(0x0000000000000001)
+
+struct coprocessor_request_block {
+	__be32 ccw;
+	__be32 flags;
+	__be64 csb_addr;
+
+	struct data_descriptor_entry source;
+	struct data_descriptor_entry target;
+
+	struct coprocessor_completion_block ccb;
+
+	u8 reserved[48];
+
+	struct coprocessor_status_block csb;
+} __packed __aligned(CRB_ALIGN);
+
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1.1.1 RS
+ * Chapter 8.2.3 Coprocessor Directive
+ * Chapter 8.2.4 Execution
+ *
+ * The CCW must be converted to BE before passing to icswx()
+ */
+
+#define CCW_PS			(0xff000000)
+#define CCW_CT			(0x00ff0000)
+#define CCW_CD			(0x0000ffff)
+#define CCW_CL			(0x0000c000)
+
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1 Initiate Coprocessor Store Word Indexed (ICSWX)
+ * Chapter 8.2.4.1 Condition Register 0
+ */
+
+#define ICSWX_INITIATED		(0x8)
+#define ICSWX_BUSY		(0x4)
+#define ICSWX_REJECTED		(0x2)
+
+static inline int icswx(__be32 ccw, struct coprocessor_request_block *crb)
+{
+	__be64 ccw_reg = ccw;
+	u32 cr;
+
+	__asm__ __volatile__(
+	PPC_ICSWX(%1,0,%2) "\n"
+	"mfcr %0\n"
+	: "=r" (cr)
+	: "r" (ccw_reg), "r" (crb)
+	: "cr0", "memory");
+
+	return (int)((cr >> 28) & 0xf);
+}
+
+
+#endif /* _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_ */
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -136,6 +136,8 @@
 #define PPC_INST_DCBAL			0x7c2005ec
 #define PPC_INST_DCBZL			0x7c2007ec
 #define PPC_INST_ICBT			0x7c00002c
+#define PPC_INST_ICSWX			0x7c00032d
+#define PPC_INST_ICSWEPX		0x7c00076d
 #define PPC_INST_ISEL			0x7c00001e
 #define PPC_INST_ISEL_MASK		0xfc00003e
 #define PPC_INST_LDARX			0x7c0000a8
@@ -403,4 +405,15 @@
 #define MFTMR(tmr, r)		stringify_in_c(.long PPC_INST_MFTMR | \
 					       TMRN(tmr) | ___PPC_RT(r))

+/* Coprocessor instructions */
+#define PPC_ICSWX(s, a, b)	stringify_in_c(.long PPC_INST_ICSWX |	\
+					       ___PPC_RS(s) |		\
+					       ___PPC_RA(a) |		\
+					       ___PPC_RB(b))
+#define PPC_ICSWEPX(s, a, b)	stringify_in_c(.long PPC_INST_ICSWEPX | \
+					       ___PPC_RS(s) |		\
+					       ___PPC_RA(a) |		\
+					       ___PPC_RB(b))
+
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -800,6 +800,7 @@ int of_get_ibm_chip_id(struct device_node *np)
 	}
 	return -1;
 }
+EXPORT_SYMBOL(of_get_ibm_chip_id);

 /**
 * cpu_to_chip_id - Return the cpus chip-id

--- a/arch/sparc/crypto/md5_glue.c
+++ b/arch/sparc/crypto/md5_glue.c
@@ -33,10 +33,10 @@ static int md5_sparc64_init(struct shash_desc *desc)
 {
 	struct md5_state *mctx = shash_desc_ctx(desc);

-	mctx->hash[0] = cpu_to_le32(0x67452301);
-	mctx->hash[1] = cpu_to_le32(0xefcdab89);
-	mctx->hash[2] = cpu_to_le32(0x98badcfe);
-	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;

 	return 0;

--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -156,7 +156,7 @@ int __init crypto_fpu_init(void)
 	return crypto_register_template(&crypto_fpu_tmpl);
 }

-void __exit crypto_fpu_exit(void)
+void crypto_fpu_exit(void)
 {
 	crypto_unregister_template(&crypto_fpu_tmpl);
 }

--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -882,7 +882,8 @@ static int __init sha1_mb_mod_init(void)
 		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
 		cpu_state->cpu = cpu;
 		cpu_state->alg_state = &sha1_mb_alg_state;
-		cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL);
+		cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
+					GFP_KERNEL);
 		if (!cpu_state->mgr)
 			goto err2;
 		sha1_ctx_mgr_init(cpu_state->mgr);

--- a/crypto/842.c
+++ b/crypto/842.c
 /*
- * Cryptographic API for the 842 compression algorithm.
+ * Cryptographic API for the 842 software compression algorithm.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -11,173 +11,73 @@
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ * Copyright (C) IBM Corporation, 2011-2015
 *
- * Copyright (C) IBM Corporation, 2011
+ * Original Authors: Robert Jennings <rcj@linux.vnet.ibm.com>
+ *                   Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
- * Authors: Robert Jennings <rcj@linux.vnet.ibm.com>
- *          Seth Jennings <sjenning@linux.vnet.ibm.com>
+ * Rewrite: Dan Streetman <ddstreet@ieee.org>
+ *
+ * This is the software implementation of compression and decompression using
+ * the 842 format.  This uses the software 842 library at lib/842/ which is
+ * only a reference implementation, and is very, very slow as compared to other
+ * software compressors.  You probably do not want to use this software
+ * compression.  If you have access to the PowerPC 842 compression hardware, you
+ * want to use the 842 hardware compression interface, which is at:
+ * drivers/crypto/nx/nx-842-crypto.c
 */

 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/crypto.h>
-#include <linux/vmalloc.h>
-#include <linux/nx842.h>
-#include <linux/lzo.h>
-#include <linux/timer.h>
-
-static int nx842_uselzo;
-
-struct nx842_ctx {
-	void *nx842_wmem; /* working memory for 842/lzo */
-};
+#include <linux/sw842.h>

-enum nx842_crypto_type {
-	NX842_CRYPTO_TYPE_842,
-	NX842_CRYPTO_TYPE_LZO
+struct crypto842_ctx {
+	char wmem[SW842_MEM_COMPRESS];	/* working memory for compress */
 };

-#define NX842_SENTINEL 0xdeadbeef
-
-struct nx842_crypto_header {
-	unsigned int sentinel; /* debug */
-	enum nx842_crypto_type type;
-};
-
-static int nx842_init(struct crypto_tfm *tfm)
-{
-	struct nx842_ctx *ctx = crypto_tfm_ctx(tfm);
-	int wmemsize;
-
-	wmemsize = max_t(int, nx842_get_workmem_size(), LZO1X_MEM_COMPRESS);
-	ctx->nx842_wmem = kmalloc(wmemsize, GFP_NOFS);
-	if (!ctx->nx842_wmem)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static void nx842_exit(struct crypto_tfm *tfm)
-{
-	struct nx842_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	kfree(ctx->nx842_wmem);
-}
-
-static void nx842_reset_uselzo(unsigned long data)
+static int crypto842_compress(struct crypto_tfm *tfm,
+			      const u8 *src, unsigned int slen,
+			      u8 *dst, unsigned int *dlen)
 {
-	nx842_uselzo = 0;
-}
-
-static DEFINE_TIMER(failover_timer, nx842_reset_uselzo, 0, 0);
-
-static int nx842_crypto_compress(struct crypto_tfm *tfm, const u8 *src,
-			    unsigned int slen, u8 *dst, unsigned int *dlen)
-{
-	struct nx842_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct nx842_crypto_header *hdr;
-	unsigned int tmp_len = *dlen;
-	size_t lzodlen; /* needed for lzo */
-	int err;
-
-	*dlen = 0;
-	hdr = (struct nx842_crypto_header *)dst;
-	hdr->sentinel = NX842_SENTINEL; /* debug */
-	dst += sizeof(struct nx842_crypto_header);
-	tmp_len -= sizeof(struct nx842_crypto_header);
-	lzodlen = tmp_len;
-
-	if (likely(!nx842_uselzo)) {
-		err = nx842_compress(src, slen, dst, &tmp_len, ctx->nx842_wmem);
-
-		if (likely(!err)) {
-			hdr->type = NX842_CRYPTO_TYPE_842;
-			*dlen = tmp_len + sizeof(struct nx842_crypto_header);
-			return 0;
-		}
-
-		/* hardware failed */
-		nx842_uselzo = 1;
+	struct crypto842_ctx *ctx = crypto_tfm_ctx(tfm);

-		/* set timer to check for hardware again in 1 second */
-		mod_timer(&failover_timer, jiffies + msecs_to_jiffies(1000));
-	}
-
-	/* no hardware, use lzo */
-	err = lzo1x_1_compress(src, slen, dst, &lzodlen, ctx->nx842_wmem);
-	if (err != LZO_E_OK)
-		return -EINVAL;
-
-	hdr->type = NX842_CRYPTO_TYPE_LZO;
-	*dlen = lzodlen + sizeof(struct nx842_crypto_header);
-	return 0;
+	return sw842_compress(src, slen, dst, dlen, ctx->wmem);
 }

-static int nx842_crypto_decompress(struct crypto_tfm *tfm, const u8 *src,
-			      unsigned int slen, u8 *dst, unsigned int *dlen)
+static int crypto842_decompress(struct crypto_tfm *tfm,
+				const u8 *src, unsigned int slen,
+				u8 *dst, unsigned int *dlen)
 {
-	struct nx842_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct nx842_crypto_header *hdr;
-	unsigned int tmp_len = *dlen;
-	size_t lzodlen; /* needed for lzo */
-	int err;
-
-	*dlen = 0;
-	hdr = (struct nx842_crypto_header *)src;
-
-	if (unlikely(hdr->sentinel != NX842_SENTINEL))
-		return -EINVAL;
-
-	src += sizeof(struct nx842_crypto_header);
-	slen -= sizeof(struct nx842_crypto_header);
-
-	if (likely(hdr->type == NX842_CRYPTO_TYPE_842)) {
-		err = nx842_decompress(src, slen, dst, &tmp_len,
-			ctx->nx842_wmem);
-		if (err)
-			return -EINVAL;
-		*dlen = tmp_len;
-	} else if (hdr->type == NX842_CRYPTO_TYPE_LZO) {
-		lzodlen = tmp_len;
-		err = lzo1x_decompress_safe(src, slen, dst, &lzodlen);
-		if (err != LZO_E_OK)
-			return -EINVAL;
-		*dlen = lzodlen;
-	} else
-		return -EINVAL;
-
-	return 0;
+	return sw842_decompress(src, slen, dst, dlen);
 }

 static struct crypto_alg alg = {
 	.cra_name		= "842",
+	.cra_driver_name	= "842-generic",
+	.cra_priority		= 100,
 	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
-	.cra_ctxsize		= sizeof(struct nx842_ctx),
+	.cra_ctxsize		= sizeof(struct crypto842_ctx),
 	.cra_module		= THIS_MODULE,
-	.cra_init		= nx842_init,
-	.cra_exit		= nx842_exit,
 	.cra_u			= { .compress = {
-	.coa_compress		= nx842_crypto_compress,
-	.coa_decompress		= nx842_crypto_decompress } }
+	.coa_compress		= crypto842_compress,
+	.coa_decompress		= crypto842_decompress } }
 };

-static int __init nx842_mod_init(void)
+static int __init crypto842_mod_init(void)
 {
-	del_timer(&failover_timer);
 	return crypto_register_alg(&alg);
 }
+module_init(crypto842_mod_init);

-static void __exit nx842_mod_exit(void)
+static void __exit crypto842_mod_exit(void)
 {
 	crypto_unregister_alg(&alg);
 }
-
-module_init(nx842_mod_init);
-module_exit(nx842_mod_exit);
+module_exit(crypto842_mod_exit);

 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("842 Compression Algorithm");
+MODULE_DESCRIPTION("842 Software Compression Algorithm");
 MODULE_ALIAS_CRYPTO("842");
+MODULE_ALIAS_CRYPTO("842-generic");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
--- a/crypto/Makefile
+++ b/crypto/Makefile
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
--- a/crypto/aead.c
+++ b/crypto/aead.c
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -127,6 +127,7 @@ EXPORT_SYMBOL_GPL(af_alg_release);

 static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
+	const u32 forbidden = CRYPTO_ALG_INTERNAL;
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct sockaddr_alg *sa = (void *)uaddr;
@@ -151,7 +152,9 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (IS_ERR(type))
 		return PTR_ERR(type);

-	private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask);
+	private = type->bind(sa->salg_name,
+			     sa->salg_feat & ~forbidden,
+			     sa->salg_mask & ~forbidden);
 	if (IS_ERR(private)) {
 		module_put(type->owner);
 		return PTR_ERR(private);

--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
--- a/crypto/algif_rng.c
+++ b/crypto/algif_rng.c
@@ -164,7 +164,7 @@ static int rng_setkey(void *private, const u8 *seed, unsigned int seedlen)
 	 * Check whether seedlen is of sufficient size is done in RNG
 	 * implementations.
 	 */
-	return crypto_rng_reset(private, (u8 *)seed, seedlen);
+	return crypto_rng_reset(private, seed, seedlen);
 }

 static const struct af_alg_type algif_type_rng = {

--- a/crypto/ansi_cprng.c
+++ b/crypto/ansi_cprng.c
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
--- a/crypto/authencesn.c
+++ b/crypto/authencesn.c
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -14,6 +14,7 @@
 *
 */

+#include <crypto/aead.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/errno.h>

--- a/crypto/ccm.c
+++ b/crypto/ccm.c
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
--- a/crypto/chainiv.c
+++ b/crypto/chainiv.c
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
--- a/crypto/crypto_null.c
+++ b/crypto/crypto_null.c
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
--- a/crypto/echainiv.c
+++ b/crypto/echainiv.c
--- a/crypto/eseqiv.c
+++ b/crypto/eseqiv.c
--- a/crypto/fips.c
+++ b/crypto/fips.c
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
--- a/crypto/internal.h
+++ b/crypto/internal.h
--- a/crypto/jitterentropy.c
+++ b/crypto/jitterentropy.c
--- a/crypto/krng.c
+++ b/crypto/krng.c
--- a/crypto/md5.c
+++ b/crypto/md5.c
--- a/crypto/pcompress.c
+++ b/crypto/pcompress.c
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
--- a/crypto/proc.c
+++ b/crypto/proc.c
--- a/crypto/rng.c
+++ b/crypto/rng.c
--- a/crypto/rsa.c
+++ b/crypto/rsa.c
--- a/crypto/rsa_helper.c
+++ b/crypto/rsa_helper.c
--- a/crypto/rsakey.asn1
+++ b/crypto/rsakey.asn1
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
--- a/crypto/seqiv.c
+++ b/crypto/seqiv.c
--- a/crypto/shash.c
+++ b/crypto/shash.c
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
--- a/crypto/zlib.c
+++ b/crypto/zlib.c
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
--- a/drivers/crypto/caam/Kconfig
+++ b/drivers/crypto/caam/Kconfig
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
--- a/drivers/crypto/caam/compat.h
+++ b/drivers/crypto/caam/compat.h
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
--- a/drivers/crypto/caam/sg_sw_sec4.h
+++ b/drivers/crypto/caam/sg_sw_sec4.h
--- a/drivers/crypto/ccp/Kconfig
+++ b/drivers/crypto/ccp/Kconfig
--- a/drivers/crypto/ccp/ccp-ops.c
+++ b/drivers/crypto/ccp/ccp-ops.c
--- a/drivers/crypto/ccp/ccp-platform.c
+++ b/drivers/crypto/ccp/ccp-platform.c
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
--- a/drivers/crypto/marvell/Makefile
+++ b/drivers/crypto/marvell/Makefile
--- a/drivers/crypto/marvell/cesa.c
+++ b/drivers/crypto/marvell/cesa.c
--- a/drivers/crypto/marvell/cesa.h
+++ b/drivers/crypto/marvell/cesa.h
--- a/drivers/crypto/marvell/cipher.c
+++ b/drivers/crypto/marvell/cipher.c
--- a/drivers/crypto/marvell/hash.c
+++ b/drivers/crypto/marvell/hash.c
--- a/drivers/crypto/marvell/tdma.c
+++ b/drivers/crypto/marvell/tdma.c
--- a/drivers/crypto/mv_cesa.c
+++ b/drivers/crypto/mv_cesa.c
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
--- a/drivers/crypto/nx/Kconfig
+++ b/drivers/crypto/nx/Kconfig
--- a/drivers/crypto/nx/Makefile
+++ b/drivers/crypto/nx/Makefile
--- a/drivers/crypto/nx/nx-842-crypto.c
+++ b/drivers/crypto/nx/nx-842-crypto.c
--- a/drivers/crypto/nx/nx-842-platform.c
+++ b/drivers/crypto/nx/nx-842-platform.c
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
--- a/drivers/crypto/nx/nx-842-pseries.c
+++ b/drivers/crypto/nx/nx-842-pseries.c
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
--- a/drivers/crypto/nx/nx-842.h
+++ b/drivers/crypto/nx/nx-842.h
--- a/drivers/crypto/nx/nx-aes-gcm.c
+++ b/drivers/crypto/nx/nx-aes-gcm.c
--- a/drivers/crypto/nx/nx-sha256.c
+++ b/drivers/crypto/nx/nx-sha256.c
--- a/drivers/crypto/nx/nx-sha512.c
+++ b/drivers/crypto/nx/nx-sha512.c
--- a/drivers/crypto/nx/nx.c
+++ b/drivers/crypto/nx/nx.c
--- a/drivers/crypto/nx/nx.h
+++ b/drivers/crypto/nx/nx.h
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
--- a/drivers/crypto/qat/Kconfig
+++ b/drivers/crypto/qat/Kconfig
--- a/drivers/crypto/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/qat/qat_common/adf_accel_devices.h
--- a/drivers/crypto/qat/qat_common/adf_cfg_user.h
+++ b/drivers/crypto/qat/qat_common/adf_cfg_user.h
--- a/drivers/crypto/qat/qat_common/adf_common_drv.h
+++ b/drivers/crypto/qat/qat_common/adf_common_drv.h
--- a/drivers/crypto/qat/qat_common/adf_ctl_drv.c
+++ b/drivers/crypto/qat/qat_common/adf_ctl_drv.c
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
--- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
--- a/drivers/crypto/sahara.c
+++ b/drivers/crypto/sahara.c
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
--- a/drivers/crypto/talitos.h
+++ b/drivers/crypto/talitos.h
--- a/drivers/crypto/ux500/Kconfig
+++ b/drivers/crypto/ux500/Kconfig
--- a/drivers/crypto/vmx/Kconfig
+++ b/drivers/crypto/vmx/Kconfig
--- a/drivers/crypto/vmx/Makefile
+++ b/drivers/crypto/vmx/Makefile
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
--- a/drivers/crypto/vmx/aesp8-ppc.h
+++ b/drivers/crypto/vmx/aesp8-ppc.h
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
--- a/drivers/crypto/vmx/vmx.c
+++ b/drivers/crypto/vmx/vmx.c
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
--- a/include/crypto/compress.h
+++ b/include/crypto/compress.h
--- a/include/crypto/cryptd.h
+++ b/include/crypto/cryptd.h
--- a/include/crypto/drbg.h
+++ b/include/crypto/drbg.h
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
--- a/include/crypto/internal/aead.h
+++ b/include/crypto/internal/aead.h
--- a/include/crypto/internal/akcipher.h
+++ b/include/crypto/internal/akcipher.h
--- a/include/crypto/internal/geniv.h
+++ b/include/crypto/internal/geniv.h
--- a/include/crypto/internal/rng.h
+++ b/include/crypto/internal/rng.h
--- a/include/crypto/internal/rsa.h
+++ b/include/crypto/internal/rsa.h
--- a/include/crypto/md5.h
+++ b/include/crypto/md5.h
--- a/include/crypto/null.h
+++ b/include/crypto/null.h
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
--- a/include/crypto/scatterwalk.h
+++ b/include/crypto/scatterwalk.h
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
--- a/include/linux/module.h
+++ b/include/linux/module.h
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
--- a/include/linux/nx842.h
+++ b/include/linux/nx842.h
--- a/include/linux/random.h
+++ b/include/linux/random.h
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
--- a/include/linux/sw842.h
+++ b/include/linux/sw842.h
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
--- a/include/linux/cryptouser.h
+++ b/include/linux/cryptouser.h
--- a/lib/842/842.h
+++ b/lib/842/842.h
--- a/lib/842/842_compress.c
+++ b/lib/842/842_compress.c
--- a/lib/842/842_debugfs.h
+++ b/lib/842/842_debugfs.h
--- a/lib/842/842_decompress.c
+++ b/lib/842/842_decompress.c
--- a/lib/842/Makefile
+++ b/lib/842/Makefile
--- a/lib/Kconfig
+++ b/lib/Kconfig
--- a/lib/Makefile
+++ b/lib/Makefile
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
--- a/net/mac80211/aes_gcm.c
+++ b/net/mac80211/aes_gcm.c
--- a/net/mac80211/aes_gmac.c
+++ b/net/mac80211/aes_gmac.c
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c