From 8e7f966bf3579629ce553f9512ee6952588c02a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ulf=20M=C3=B6ller?=
Date: Wed, 5 May 1999 00:23:53 +0000
Subject: [PATCH] SHA-1 cleanups and performance enhancements.

Submitted by: Andy Polyakov
---
 CHANGES                 |   5 +-
 Configure               |   3 +
 crypto/opensslconf.h.in |  13 ++--
 crypto/sha/sha.h        |  26 +++++--
 crypto/sha/sha1dgst.c   | 153 +++++++++++++++++++++++--------------
 crypto/sha/sha_dgst.c   | 163 +++++++++++++++++++++++++++-------------
 crypto/sha/sha_locl.h   |  68 ++++++++++++++---
 7 files changed, 298 insertions(+), 133 deletions(-)

diff --git a/CHANGES b/CHANGES
index 8f18223c71..519dca970c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -5,8 +5,11 @@
 
  Changes between 0.9.2b and 0.9.3
 
+  *) SHA-1 cleanups and performance enhancements.
+     [Andy Polyakov ]
+
   *) Sparc v8plus assembler for the bignum library.
-     [Andy Polyakov ]
+     [Andy Polyakov ]
 
   *) Accept any -xxx and +xxx compiler options in Configure.
      [Ulf Möller]
diff --git a/Configure b/Configure
index eff6b1e42b..f4e97b4b9a 100755
--- a/Configure
+++ b/Configure
@@ -587,6 +587,9 @@ while ()
 		{ printf OUT "#define RC4_INT unsigned %s\n",$type[$rc4_int]; }
 	elsif	(/^#((define)|(undef))\s+RC4_INDEX/)
 		{ printf OUT "#%s RC4_INDEX\n",($rc4_idx)?"define":"undef"; }
+	elsif	(/^#(define|undef)\s+I386_ONLY/)
+		{ printf OUT "#%s I386_ONLY\n", ($processor == 386)?
+			"define":"undef"; }
 	elsif	(/^#define\s+MD2_INT\s/)
 		{ printf OUT "#define MD2_INT unsigned %s\n",$type[$md2_int]; }
 	elsif	(/^#define\s+IDEA_INT\s/)
diff --git a/crypto/opensslconf.h.in b/crypto/opensslconf.h.in
index cd05361eb8..4e28f3e666 100644
--- a/crypto/opensslconf.h.in
+++ b/crypto/opensslconf.h.in
@@ -1,6 +1,9 @@
 /* crypto/opensslconf.h */
 /* WARNING: This file is autogenerated by Configure */
 
+/* Generate 80386 code? */
+#undef I386_ONLY
+
 #if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
 #define OPENSSLDIR "/usr/local/ssl"
 #endif
@@ -34,7 +37,7 @@
 #if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
 #define CONFIG_HEADER_BN_H
 
-#define BN_LLONG
+#undef BN_LLONG
 
 /* Should we define BN_DIV2W here? */
 
@@ -53,7 +56,7 @@
 #define CONFIG_HEADER_RC4_LOCL_H
 /* if this is defined data[i] is used instead of *data, this is a %20
  * speedup on x86 */
-#define RC4_INDEX
+#undef RC4_INDEX
 #endif
 
 #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
@@ -67,14 +70,14 @@
 /* the following is tweaked from a config script, that is why it is a
  * protected undef/define */
 #ifndef DES_PTR
-#define DES_PTR
+#undef DES_PTR
 #endif
 
 /* This helps C compiler generate the correct code for multiple functional
  * units.  It reduces register dependancies at the expense of 2 more
  * registers */
 #ifndef DES_RISC1
-#define DES_RISC1
+#undef DES_RISC1
 #endif
 
 #ifndef DES_RISC2
@@ -88,7 +91,7 @@ YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
 /* Unroll the inner loop, this sometimes helps, sometimes hinders.
  * Very mucy CPU dependant */
 #ifndef DES_UNROLL
-#define DES_UNROLL
+#undef DES_UNROLL
 #endif
 
 /* These default values were supplied by
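Aside, not part of the patch: the I386_ONLY switch generated above exists because the byte-swapping shortcut this patch adds later (the bswapl instruction in sha_locl.h) first appeared on the 80486, so a build targeting a plain 80386 must keep a shift-and-mask fallback. A stand-alone sketch of that gating pattern, with hypothetical names:

    /* Illustration only: how a generated I386_ONLY macro is typically
     * consumed.  swap32() is a made-up name, not from the tree. */
    #include <stdio.h>

    static unsigned int swap32(unsigned int x)
        {
    #if defined(__GNUC__) && defined(__i386) && !defined(I386_ONLY)
        /* bswap requires an 80486 or newer, hence the gate */
        asm ("bswapl %0" : "=r"(x) : "0"(x));
    #else
        /* portable fallback that an 80386 can run */
        x = (x>>24) | ((x>>8)&0xff00) | ((x&0xff00)<<8) | (x<<24);
    #endif
        return x;
        }

    int main(void)
        {
        printf("%08x\n", swap32(0x11223344));   /* prints 44332211 */
        return 0;
        }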
diff --git a/crypto/sha/sha.h b/crypto/sha/sha.h
index ba40aafc13..cd6960ee1a 100644
--- a/crypto/sha/sha.h
+++ b/crypto/sha/sha.h
@@ -67,18 +67,28 @@ extern "C" {
 #error SHA is disabled.
 #endif
 
-#define SHA_CBLOCK	64
-#define SHA_LBLOCK	16
-#define SHA_BLOCK	16
-#define SHA_LAST_BLOCK	56
-#define SHA_LENGTH_BLOCK 8
-#define SHA_DIGEST_LENGTH 20
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
+ * ! SHA_LONG_LOG2 has to be defined along.                        !
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
 
-#ifdef  WIN16
+#if defined(WIN16) || defined(__LP32__)
+#define SHA_LONG unsigned long
+#elif defined(_CRAY) || defined(__ILP64__)
 #define SHA_LONG unsigned long
+#define SHA_LONG_LOG2 3
 #else
 #define SHA_LONG unsigned int
-#endif
+#endif
+
+#define SHA_LBLOCK	16
+#define SHA_CBLOCK	(SHA_LBLOCK*4)	/* SHA treats input data as a
+					 * contiguous array of 32 bit
+					 * wide big-endian values. */
+#define SHA_LAST_BLOCK	(SHA_CBLOCK-8)
+#define SHA_DIGEST_LENGTH 20
 
 typedef struct SHAstate_st
 	{
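For orientation, a minimal caller of the interface declared above; this is a sketch rather than part of the patch, and the include path is an assumption about this source tree. The expected output is the standard SHA-1 test vector for "abc":

    #include <stdio.h>
    #include "sha.h"    /* path is an assumption; <openssl/sha.h> in later trees */

    int main(void)
        {
        SHA_CTX ctx;
        unsigned char md[SHA_DIGEST_LENGTH];
        int i;

        SHA1_Init(&ctx);
        SHA1_Update(&ctx,(unsigned char *)"abc",3);
        SHA1_Final(md,&ctx);
        for (i=0; i<SHA_DIGEST_LENGTH; i++)
            printf("%02x",md[i]);
        printf("\n");   /* a9993e364706816aba3e25717850c26c9cd0d89d */
        return 0;
        }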
diff --git a/crypto/sha/sha1dgst.c b/crypto/sha/sha1dgst.c
index f4a47f3768..e867f6972b 100644
--- a/crypto/sha/sha1dgst.c
+++ b/crypto/sha/sha1dgst.c
@@ -81,14 +81,14 @@ char *SHA1_version="SHA1" OPENSSL_VERSION_PTEXT;
 #define K_40_59 0x8f1bbcdcUL
 #define K_60_79 0xca62c1d6UL
 
-# ifdef SHA1_ASM
-   void sha1_block_x86(SHA_CTX *c, register SHA_LONG *p, int num);
-#  define sha1_block sha1_block_x86
-# else
-   void sha1_block(SHA_CTX *c, register SHA_LONG *p, int num);
-# endif
+#ifdef SHA1_ASM
+   void sha1_block_x86(SHA_CTX *c, register SHA_LONG *p, int num);
+#  define sha1_block(c,p,n)	sha1_block_x86((c),(p),(n)*SHA_CBLOCK)
+#else
+   static void sha1_block(SHA_CTX *c, register SHA_LONG *p, int num);
+#endif
 
-#if defined(L_ENDIAN) && defined(SHA1_ASM)
+#if !defined(B_ENDIAN) && defined(SHA1_ASM)
 # define M_c2nl		c2l
 # define M_p_c2nl	p_c2l
 # define M_c2nl_p	c2l_p
@@ -147,7 +147,7 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 			}
 
 		len-=(SHA_CBLOCK-c->num);
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		c->num=0;
 		/* drop through and do the rest */
 		}
@@ -184,15 +184,15 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 	 * copies it to a local array.  I should be able to do this for
 	 * the C version as well....
 	 */
-#if 1
+#if SHA_LONG_LOG2==2
 #if defined(B_ENDIAN) || defined(SHA1_ASM)
 	if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 		{
 		sw=len/SHA_CBLOCK;
 		if (sw)
 			{
-			sw*=SHA_CBLOCK;
 			sha1_block(c,(SHA_LONG *)data,sw);
+			sw*=SHA_CBLOCK;
 			data+=sw;
 			len-=sw;
 			}
@@ -204,35 +204,61 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 	p=c->data;
 	while (len >= SHA_CBLOCK)
 		{
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA1_ASM)
+#define SHA_NO_TAIL_CODE
+		/*
+		 * Basically we get here only when data happens
+		 * to be unaligned.
+		 */
 		if (p != (SHA_LONG *)data)
 			memcpy(p,data,SHA_CBLOCK);
 		data+=SHA_CBLOCK;
-# ifdef L_ENDIAN
-#  ifndef SHA1_ASM /* Will not happen */
-		for (sw=(SHA_LBLOCK/4); sw; sw--)
+		sha1_block(c,p=c->data,1);
+		len-=SHA_CBLOCK;
+#else	/* little-endian */
+#define BE_COPY(dst,src,i) {			\
+		l = ((SHA_LONG *)src)[i];	\
+		Endian_Reverse32(l);		\
+		dst[i] = l;			\
+		}
+		if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 			{
-			Endian_Reverse32(p[0]);
-			Endian_Reverse32(p[1]);
-			Endian_Reverse32(p[2]);
-			Endian_Reverse32(p[3]);
-			p+=4;
+			for (sw=(SHA_LBLOCK/4); sw; sw--)
+				{
+				BE_COPY(p,data,0);
+				BE_COPY(p,data,1);
+				BE_COPY(p,data,2);
+				BE_COPY(p,data,3);
+				p+=4;
+				data += 4*sizeof(SHA_LONG);
+				}
+			sha1_block(c,p=c->data,1);
+			len-=SHA_CBLOCK;
+			continue;
 			}
+#endif
+#endif
+#ifndef SHA_NO_TAIL_CODE
+		/*
+		 * In addition to "sizeof(SHA_LONG)!= 4" case the
+		 * following code covers unaligned access cases on
+		 * little-endian machines.
+		 *
+		 */
 		p=c->data;
-#  endif
-# endif
-#else
-		for (sw=(SHA_BLOCK/4); sw; sw--)
+		for (sw=(SHA_LBLOCK/4); sw; sw--)
 			{
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
+			M_c2nl(data,l); p[0]=l;
+			M_c2nl(data,l); p[1]=l;
+			M_c2nl(data,l); p[2]=l;
+			M_c2nl(data,l); p[3]=l;
+			p+=4;
 			}
 		p=c->data;
-#endif
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		len-=SHA_CBLOCK;
+#endif
 		}
 	ec=(int)len;
 	c->num=ec;
@@ -247,26 +273,35 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 
 void SHA1_Transform(SHA_CTX *c, unsigned char *b)
 	{
-	SHA_LONG p[16];
-#ifndef B_ENDIAN
+	SHA_LONG p[SHA_LBLOCK];
 	SHA_LONG *q;
 	int i;
-#endif
 
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-	memcpy(p,b,64);
-#ifdef L_ENDIAN
-	q=p;
-	for (i=(SHA_LBLOCK/4); i; i--)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA1_ASM)
+	memcpy(p,b,SHA_CBLOCK);
+	sha1_block(c,p,1);
+	return;
+#else
+	if (((unsigned long)b%sizeof(SHA_LONG)) == 0)
 		{
-		Endian_Reverse32(q[0]);
-		Endian_Reverse32(q[1]);
-		Endian_Reverse32(q[2]);
-		Endian_Reverse32(q[3]);
-		q+=4;
+		q=p;
+		for (i=(SHA_LBLOCK/4); i; i--)
+			{
+			unsigned long l;
+			BE_COPY(q,b,0);	/* BE_COPY was defined above */
+			BE_COPY(q,b,1);
+			BE_COPY(q,b,2);
+			BE_COPY(q,b,3);
+			q+=4;
+			b+=4*sizeof(SHA_LONG);
+			}
+		sha1_block(c,p,1);
+		return;
 		}
 #endif
-#else
+#endif
+#ifndef SHA_NO_TAIL_CODE /* defined above, see comment */
 	q=p;
 	for (i=(SHA_LBLOCK/4); i; i--)
 		{
@@ -276,16 +311,15 @@ void SHA1_Transform(SHA_CTX *c, unsigned char *b)
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		}
+	sha1_block(c,p,1);
 #endif
-	sha1_block(c,p,64);
 	}
 #ifndef SHA1_ASM
-
-void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
+static void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	{
 	register SHA_LONG A,B,C,D,E,T;
-	SHA_LONG X[16];
+	SHA_LONG X[SHA_LBLOCK];
 
 	A=c->h0;
 	B=c->h1;
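The hunks above replace the word-by-word Endian_Reverse32() pass with BE_COPY(): when the input is suitably aligned, each 32-bit word is fetched with a single load and byte-swapped in a register instead of being assembled from four byte loads. A self-contained sketch of the idea (hypothetical names; a portable swap stands in for the optimised macro):

    #include <stdio.h>

    /* portable stand-in for Endian_Reverse32 */
    #define Endian_Reverse32(a) ((a) = ((a)>>24) | (((a)>>8)&0x0000ff00) | \
                                       (((a)<<8)&0x00ff0000) | ((a)<<24))

    /* one aligned word load plus a swap, as in the patch's BE_COPY */
    #define BE_COPY(dst,src,i) {                                \
        unsigned int l = ((const unsigned int *)(src))[i];      \
        Endian_Reverse32(l);                                    \
        (dst)[i] = l;                                           \
        }

    int main(void)
        {
        static unsigned int aligned[2] = { 0, 0 };  /* forces word alignment */
        unsigned char *data = (unsigned char *)aligned;
        unsigned int w[2], j;

        for (j=0; j<8; j++) data[j]=(unsigned char)(j+1);
        if (((unsigned long)data % sizeof(unsigned int)) == 0)
            {                   /* fast path: word loads plus swap */
            BE_COPY(w,data,0);
            BE_COPY(w,data,1);
            }
        else
            {                   /* tail code: byte gathering */
            w[0]=(data[0]<<24)|(data[1]<<16)|(data[2]<<8)|data[3];
            w[1]=(data[4]<<24)|(data[5]<<16)|(data[6]<<8)|data[7];
            }
        printf("%08x %08x\n",w[0],w[1]);  /* 01020304 05060708 on little-endian */
        return 0;
        }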
@@ -385,8 +419,7 @@ void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	c->h3=(c->h3+B)&0xffffffffL;
 	c->h4=(c->h4+C)&0xffffffffL;
 
-	num-=64;
-	if (num <= 0) break;
+	if (--num <= 0) break;
 
 	A=c->h0;
 	B=c->h1;
@@ -394,7 +427,12 @@ void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	D=c->h3;
 	E=c->h4;
 
-	W+=16;
+	W+=SHA_LBLOCK;	/* Note! This can happen only when sizeof(SHA_LONG)
+			 * is 4. Whenever it's not the actual case this
+			 * function is never called with num larger than 1
+			 * and we never advance down here.
+			 *
+			 */
 	}
 }
 #endif
@@ -423,18 +461,20 @@ void SHA1_Final(unsigned char *md, SHA_CTX *c)
 		{
 		for (; i<SHA_LBLOCK; i++)
 			p[i]=0;
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		i=0;
 		}
 	for (; i<(SHA_LBLOCK-2); i++)
 		p[i]=0;
 	p[SHA_LBLOCK-2]=c->Nh;
 	p[SHA_LBLOCK-1]=c->Nl;
-#if defined(L_ENDIAN) && defined(SHA1_ASM)
+#if SHA_LONG_LOG2==2
+#if !defined(B_ENDIAN) && defined(SHA1_ASM)
 	Endian_Reverse32(p[SHA_LBLOCK-2]);
 	Endian_Reverse32(p[SHA_LBLOCK-1]);
 #endif
-	sha1_block(c,p,64);
+#endif
+	sha1_block(c,p,1);
 	cp=md;
 	l=c->h0; nl2c(l,cp);
 	l=c->h1; nl2c(l,cp);
@@ -442,10 +482,11 @@ void SHA1_Final(unsigned char *md, SHA_CTX *c)
 	l=c->h3; nl2c(l,cp);
 	l=c->h4; nl2c(l,cp);
 
-	/* clear stuff, sha1_block may be leaving some stuff on the stack
-	 * but I'm not worried :-) */
 	c->num=0;
-/*	memset((char *)&c,0,sizeof(c));*/
+	/* sha_block may be leaving some stuff on the stack
+	 * but I'm not worried :-)
+	memset((void *)c,0,sizeof(SHA_CTX));
+	 */
 	}
 #endif
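Note the change of units running through this file: sha1_block() now takes a count of 64-byte blocks rather than a byte count, and the assembler binding converts back at the call site. A sketch of the convention (the stub stands in for the real assembler routine):

    #define SHA_CBLOCK 64

    /* stand-in for the x86 assembler routine, which still takes bytes */
    static void sha1_block_x86(void *c, unsigned int *p, int num)
        {
        /* would consume num/SHA_CBLOCK blocks here */
        }

    /* the macro from the patch: block count in, byte count out */
    #define sha1_block(c,p,n) sha1_block_x86((c),(p),(n)*SHA_CBLOCK)

    /* so a portable caller writes sha1_block(c,p,2) to hash two blocks,
     * while the C implementation's loop simply ends with
     * if (--num <= 0) break;  */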
diff --git a/crypto/sha/sha_dgst.c b/crypto/sha/sha_dgst.c
index 5827c73cea..d90f497763 100644
--- a/crypto/sha/sha_dgst.c
+++ b/crypto/sha/sha_dgst.c
@@ -81,12 +81,21 @@ char *SHA_version="SHA" OPENSSL_VERSION_PTEXT;
 #define K_40_59 0x8f1bbcdcUL
 #define K_60_79 0xca62c1d6UL
 
-	void sha_block(SHA_CTX *c, register SHA_LONG *p, int num);
-#define	M_c2nl		c2nl
-#define	M_p_c2nl	p_c2nl
-#define	M_c2nl_p	c2nl_p
-#define	M_p_c2nl_p	p_c2nl_p
-#define	M_nl2c		nl2c
+static void sha_block(SHA_CTX *c, register SHA_LONG *p, int num);
+
+#if !defined(B_ENDIAN) && defined(SHA_ASM)
+# define M_c2nl		c2l
+# define M_p_c2nl	p_c2l
+# define M_c2nl_p	c2l_p
+# define M_p_c2nl_p	p_c2l_p
+# define M_nl2c		l2c
+#else
+# define M_c2nl		c2nl
+# define M_p_c2nl	p_c2nl
+# define M_c2nl_p	c2nl_p
+# define M_p_c2nl_p	p_c2nl_p
+# define M_nl2c		nl2c
+#endif
 
 void SHA_Init(SHA_CTX *c)
 	{
@@ -133,7 +142,7 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 			}
 
 		len-=(SHA_CBLOCK-c->num);
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		c->num=0;
 		/* drop through and do the rest */
 		}
@@ -170,15 +179,15 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 	 * copies it to a local array.  I should be able to do this for
 	 * the C version as well....
 	 */
-#if 1
+#if SHA_LONG_LOG2==2
 #if defined(B_ENDIAN) || defined(SHA_ASM)
 	if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 		{
 		sw=len/SHA_CBLOCK;
 		if (sw)
 			{
-			sw*=SHA_CBLOCK;
 			sha_block(c,(SHA_LONG *)data,sw);
+			sw*=SHA_CBLOCK;
 			data+=sw;
 			len-=sw;
 			}
@@ -190,35 +199,61 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 	p=c->data;
 	while (len >= SHA_CBLOCK)
 		{
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA_ASM)
+#define SHA_NO_TAIL_CODE
+		/*
+		 * Basically we get here only when data happens
+		 * to be unaligned.
+		 */
 		if (p != (SHA_LONG *)data)
 			memcpy(p,data,SHA_CBLOCK);
 		data+=SHA_CBLOCK;
-# ifdef L_ENDIAN
-#  ifndef SHA_ASM /* Will not happen */
-		for (sw=(SHA_LBLOCK/4); sw; sw--)
+		sha_block(c,p=c->data,1);
+		len-=SHA_CBLOCK;
+#else	/* little-endian */
+#define BE_COPY(dst,src,i) {			\
+		l = ((SHA_LONG *)src)[i];	\
+		Endian_Reverse32(l);		\
+		dst[i] = l;			\
+		}
+		if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 			{
-			Endian_Reverse32(p[0]);
-			Endian_Reverse32(p[1]);
-			Endian_Reverse32(p[2]);
-			Endian_Reverse32(p[3]);
-			p+=4;
+			for (sw=(SHA_LBLOCK/4); sw; sw--)
+				{
+				BE_COPY(p,data,0);
+				BE_COPY(p,data,1);
+				BE_COPY(p,data,2);
+				BE_COPY(p,data,3);
+				p+=4;
+				data += 4*sizeof(SHA_LONG);
+				}
+			sha_block(c,p=c->data,1);
+			len-=SHA_CBLOCK;
+			continue;
 			}
+#endif
+#endif
+#ifndef SHA_NO_TAIL_CODE
+		/*
+		 * In addition to "sizeof(SHA_LONG)!= 4" case the
+		 * following code covers unaligned access cases on
+		 * little-endian machines.
+		 *
+		 */
 		p=c->data;
-#  endif
-# endif
-#else
-		for (sw=(SHA_BLOCK/4); sw; sw--)
+		for (sw=(SHA_LBLOCK/4); sw; sw--)
 			{
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
+			M_c2nl(data,l); p[0]=l;
+			M_c2nl(data,l); p[1]=l;
+			M_c2nl(data,l); p[2]=l;
+			M_c2nl(data,l); p[3]=l;
+			p+=4;
 			}
 		p=c->data;
-#endif
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		len-=SHA_CBLOCK;
+#endif
 		}
 	ec=(int)len;
 	c->num=ec;
@@ -233,26 +268,35 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 
 void SHA_Transform(SHA_CTX *c, unsigned char *b)
 	{
-	SHA_LONG p[16];
-#if !defined(B_ENDIAN)
+	SHA_LONG p[SHA_LBLOCK];
 	SHA_LONG *q;
 	int i;
-#endif
 
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-	memcpy(p,b,64);
-#ifdef L_ENDIAN
-	q=p;
-	for (i=(SHA_LBLOCK/4); i; i--)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA_ASM)
+	memcpy(p,b,SHA_CBLOCK);
+	sha_block(c,p,1);
+	return;
+#else
+	if (((unsigned long)b%sizeof(SHA_LONG)) == 0)
 		{
-		Endian_Reverse32(q[0]);
-		Endian_Reverse32(q[1]);
-		Endian_Reverse32(q[2]);
-		Endian_Reverse32(q[3]);
-		q+=4;
+		q=p;
+		for (i=(SHA_LBLOCK/4); i; i--)
+			{
+			unsigned long l;
+			BE_COPY(q,b,0);	/* BE_COPY was defined above */
+			BE_COPY(q,b,1);
+			BE_COPY(q,b,2);
+			BE_COPY(q,b,3);
+			q+=4;
+			b+=4*sizeof(SHA_LONG);
+			}
+		sha_block(c,p,1);
+		return;
 		}
 #endif
-#else
+#endif
+#ifndef SHA_NO_TAIL_CODE /* defined above, see comment */
 	q=p;
 	for (i=(SHA_LBLOCK/4); i; i--)
 		{
@@ -262,14 +306,15 @@ void SHA_Transform(SHA_CTX *c, unsigned char *b)
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		}
+	sha_block(c,p,1);
 #endif
-	sha_block(c,p,64);
 	}
 
-void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
+#ifndef SHA_ASM
+static void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	{
 	register SHA_LONG A,B,C,D,E,T;
-	SHA_LONG X[16];
+	SHA_LONG X[SHA_LBLOCK];
 
 	A=c->h0;
 	B=c->h1;
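The M_c2nl family above now switches between c2nl (big-endian byte gathering, SHA's natural input order) and c2l (little-endian) when the assembler path is in use, since the assembler routine expects host-order words. Minimal re-implementations of the two gathering macros, for illustration only and not the tree's exact definitions:

    #include <stdio.h>

    #define c2nl(c,l) (l = (((unsigned long)(c)[0])<<24) | \
                           (((unsigned long)(c)[1])<<16) | \
                           (((unsigned long)(c)[2])<< 8) | \
                           (((unsigned long)(c)[3])     ), (c)+=4)
    #define c2l(c,l)  (l = (((unsigned long)(c)[0])     ) | \
                           (((unsigned long)(c)[1])<< 8) | \
                           (((unsigned long)(c)[2])<<16) | \
                           (((unsigned long)(c)[3])<<24), (c)+=4)

    int main(void)
        {
        const unsigned char b[4] = {0xde,0xad,0xbe,0xef};
        const unsigned char *p;
        unsigned long l;

        p = b; c2nl(p,l); printf("c2nl: %08lx\n", l);  /* deadbeef */
        p = b; c2l(p,l);  printf("c2l:  %08lx\n", l);  /* efbeadde */
        return 0;
        }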
@@ -369,8 +414,7 @@ void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	c->h3=(c->h3+B)&0xffffffffL;
 	c->h4=(c->h4+C)&0xffffffffL;
 
-	num-=64;
-	if (num <= 0) break;
+	if (--num <= 0) break;
 
 	A=c->h0;
 	B=c->h1;
@@ -378,9 +422,15 @@ void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	D=c->h3;
 	E=c->h4;
 
-	W+=16;
+	W+=SHA_LBLOCK;	/* Note! This can happen only when sizeof(SHA_LONG)
+			 * is 4. Whenever it's not the actual case this
+			 * function is never called with num larger than 1
+			 * and we never advance down here.
+			 *
+			 */
 	}
 }
+#endif
 
 void SHA_Final(unsigned char *md, SHA_CTX *c)
 	{
@@ -406,14 +456,20 @@ void SHA_Final(unsigned char *md, SHA_CTX *c)
 		{
 		for (; i<SHA_LBLOCK; i++)
 			p[i]=0;
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		i=0;
 		}
 	for (; i<(SHA_LBLOCK-2); i++)
 		p[i]=0;
 	p[SHA_LBLOCK-2]=c->Nh;
 	p[SHA_LBLOCK-1]=c->Nl;
-	sha_block(c,p,64);
+#if SHA_LONG_LOG2==2
+#if !defined(B_ENDIAN) && defined(SHA_ASM)
+	Endian_Reverse32(p[SHA_LBLOCK-2]);
+	Endian_Reverse32(p[SHA_LBLOCK-1]);
+#endif
+#endif
+	sha_block(c,p,1);
 	cp=md;
 	l=c->h0; nl2c(l,cp);
 	l=c->h1; nl2c(l,cp);
@@ -421,9 +477,10 @@ void SHA_Final(unsigned char *md, SHA_CTX *c)
 	l=c->h3; nl2c(l,cp);
 	l=c->h4; nl2c(l,cp);
 
-	/* clear stuff, sha_block may be leaving some stuff on the stack
-	 * but I'm not worried :-) */
 	c->num=0;
-/*	memset((char *)&c,0,sizeof(c));*/
+	/* sha_block may be leaving some stuff on the stack
+	 * but I'm not worried :-)
+	memset((void *)c,0,sizeof(SHA_CTX));
+	 */
 	}
 #endif
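Both SHA_Final() variants above end the message with the usual MD-style length block: the 64-bit bit count, kept in c->Nh and c->Nl, lands in the last two words of the final block, high half first. A small worked example of that encoding (names are illustrative, and the mandatory 0x80 terminator byte is omitted for brevity):

    #include <stdio.h>

    int main(void)
        {
        unsigned long len=3;                     /* bytes hashed, e.g. "abc" */
        unsigned long Nl=(len<<3)&0xffffffffUL;  /* low 32 bits of bit count */
        unsigned long Nh=(len>>29);              /* high 32 bits of bit count */
        unsigned long p[16];                     /* the final 16-word block */
        int i;

        for (i=0; i<14; i++) p[i]=0;             /* padding words */
        p[16-2]=Nh;                              /* p[SHA_LBLOCK-2]=c->Nh; */
        p[16-1]=Nl;                              /* p[SHA_LBLOCK-1]=c->Nl; */
        printf("Nh=%08lx Nl=%08lx\n",Nh,Nl);     /* Nh=00000000 Nl=00000018 */
        return 0;
        }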
diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h
index 9f1251e787..32bbe30afd 100644
--- a/crypto/sha/sha_locl.h
+++ b/crypto/sha/sha_locl.h
@@ -158,30 +158,79 @@
 			 *((c)++)=(unsigned char)(((l)>>16)&0xff), \
 			 *((c)++)=(unsigned char)(((l)>>24)&0xff))
 
+#ifndef SHA_LONG_LOG2
+#define SHA_LONG_LOG2	2	/* default to 32 bits */
+#endif
+
 #undef ROTATE
+#undef Endian_Reverse32
+
 #if defined(WIN32)
 #define ROTATE(a,n)	_lrotl(a,n)
-#else
-#define ROTATE(a,n)	(((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#elif defined(__GNUC__)
+/* some inline assembler templates by */
+#if defined(__i386)
+#define ROTATE(a,n)	({ register unsigned int ret;	\
+				asm ("roll %1,%0"	\
+				: "=r"(ret)		\
+				: "I"(n), "0"(a)	\
+				: "cc");		\
+			   ret;				\
+			})
+#ifndef I386_ONLY
+#define Endian_Reverse32(a) \
+			{ register unsigned int l=(a);	\
+				asm ("bswapl %0"	\
+				: "=r"(l) : "0"(l));	\
+			  (a)=l;			\
+			}
+#endif
+#elif defined(__powerpc)
+#define ROTATE(a,n)	({ register unsigned int ret;		\
+				asm ("rlwinm %0,%1,%2,0,31"	\
+				: "=r"(ret)			\
+				: "r"(a), "I"(n));		\
+			   ret;					\
+			})
+/* Endian_Reverse32 is not needed for PowerPC */
+#endif
 #endif
 
 /* A nice byte order reversal from Wei Dai */
-#if defined(WIN32)
+#ifdef ROTATE
+#ifndef Endian_Reverse32
 /* 5 instructions with rotate instruction, else 9 */
 #define Endian_Reverse32(a) \
 	{ \
-	unsigned long l=(a); \
-	(a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \
+	unsigned long t=(a); \
+	(a)=((ROTATE(t,8)&0x00FF00FF)|(ROTATE((t&0x00FF00FF),24))); \
 	}
+#endif
 #else
+#define ROTATE(a,n)	(((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#ifndef Endian_Reverse32
 /* 6 instructions with rotate instruction, else 8 */
 #define Endian_Reverse32(a) \
 	{ \
-	unsigned long l=(a); \
-	l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \
-	(a)=ROTATE(l,16L); \
+	unsigned long t=(a); \
+	t=(((t>>8)&0x00FF00FF)|((t&0x00FF00FF)<<8)); \
+	(a)=ROTATE(t,16); \
 	}
 #endif
+/*
+ * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
+ * It's rewritten as above for two reasons:
+ *	- RISCs aren't good at long constants and have to explicitely
+ *	  compose 'em with several (well, usually 2) instructions in a
+ *	  register before performing the actual operation and (as you
+ *	  already realized:-) having same constant should inspire the
+ *	  compiler to permanently allocate the only register for it;
+ *	- most modern CPUs have two ALUs, but usually only one has
+ *	  circuitry for shifts:-( this minor tweak inspires compiler
+ *	  to schedule shift instructions in a better way...
+ *
+ *
+ */
+#endif
 
 /* As pointed out by Wei Dai , F() below can be
  * simplified to the code in F_00_19. Wei attributes these optimisations
  * to Peter Gutmann's SHS code, and he attributes it to Rich
  * Schroeppel.
  */
 
 #define	F_00_19(b,c,d)	((((c) ^ (d)) & (b)) ^ (d))
 #define	F_20_39(b,c,d)	((b) ^ (c) ^ (d))
 #define	F_40_59(b,c,d)	(((b) & (c)) | (((b)|(c)) & (d)))
 #define	F_60_79(b,c,d)	F_20_39(b,c,d)
 
-#ifdef SHA_0
 #undef Xupdate
+#ifdef SHA_0
 #define Xupdate(a,i,ia,ib,ic,id) X[(i)&0x0f]=(a)=\
 	(ia[(i)&0x0f]^ib[((i)+2)&0x0f]^ic[((i)+8)&0x0f]^id[((i)+13)&0x0f]);
 #endif
 #ifdef SHA_1
-#undef Xupdate
 #define Xupdate(a,i,ia,ib,ic,id) (a)=\
 	(ia[(i)&0x0f]^ib[((i)+2)&0x0f]^ic[((i)+8)&0x0f]^id[((i)+13)&0x0f]);\
 	X[(i)&0x0f]=(a)=ROTATE((a),1);
-- 
GitLab
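Two identities the sha_locl.h hunks rely on can be machine-checked. First, the rewritten Endian_Reverse32 really is a byte swap: ROTATE(t,8) masked with 0x00FF00FF leaves two of the bytes in place, and rotating (t & 0x00FF00FF) by 24 delivers the other two. Second, the F_00_19 form quoted from Wei Dai equals the textbook choice function (x & y) | (~x & z). A quick self-test, using unsigned int for a 32-bit word since the 1999 code assumed 32-bit longs:

    #include <stdio.h>

    #define ROTATE(a,n)    (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
    #define F_ORIG(b,c,d)  (((b) & (c)) | ((~(b)) & (d)))
    #define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))

    static unsigned int swap_rot(unsigned int t)
        {   /* the two-rotate form from sha_locl.h */
        return (ROTATE(t,8)&0x00FF00FF)|(ROTATE((t&0x00FF00FF),24));
        }

    int main(void)
        {
        unsigned int b,c,d;
        unsigned int x=0x11223344;

        printf("%08x\n",swap_rot(x));           /* prints 44332211 */
        for (b=0; b<2; b++)
         for (c=0; c<2; c++)
          for (d=0; d<2; d++)
           if ((F_ORIG(b,c,d)&1)!=(F_00_19(b,c,d)&1))
                printf("mismatch at %u%u%u\n",b,c,d);   /* never fires */
        return 0;
        }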