From 8e7f966bf3579629ce553f9512ee6952588c02a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ulf=20M=C3=B6ller?=
Date: Wed, 5 May 1999 00:23:53 +0000
Subject: [PATCH] SHA-1 cleanups and performance enhancements.

Submitted by: Andy Polyakov
---
 CHANGES                 |   5 +-
 Configure               |   3 +
 crypto/opensslconf.h.in |  13 ++--
 crypto/sha/sha.h        |  26 +++++--
 crypto/sha/sha1dgst.c   | 153 +++++++++++++++++++++++--------------
 crypto/sha/sha_dgst.c   | 163 +++++++++++++++++++++++++++-------------
 crypto/sha/sha_locl.h   |  68 ++++++++++++++---
 7 files changed, 298 insertions(+), 133 deletions(-)

diff --git a/CHANGES b/CHANGES
index 8f18223c71..519dca970c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -5,8 +5,11 @@
 
  Changes between 0.9.2b and 0.9.3
 
+  *) SHA-1 cleanups and performance enhancements.
+     [Andy Polyakov ]
+
   *) Sparc v8plus assembler for the bignum library.
-     [Andy Polyakov ]
+     [Andy Polyakov ]
 
   *) Accept any -xxx and +xxx compiler options in Configure.
      [Ulf Möller]
diff --git a/Configure b/Configure
index eff6b1e42b..f4e97b4b9a 100755
--- a/Configure
+++ b/Configure
@@ -587,6 +587,9 @@ while ()
 		{ printf OUT "#define RC4_INT unsigned %s\n",$type[$rc4_int]; }
 	elsif	(/^#((define)|(undef))\s+RC4_INDEX/)
 		{ printf OUT "#%s RC4_INDEX\n",($rc4_idx)?"define":"undef"; }
+	elsif	(/^#(define|undef)\s+I386_ONLY/)
+		{ printf OUT "#%s I386_ONLY\n", ($processor == 386)?
+			"define":"undef"; }
 	elsif	(/^#define\s+MD2_INT\s/)
 		{ printf OUT "#define MD2_INT unsigned %s\n",$type[$md2_int]; }
 	elsif	(/^#define\s+IDEA_INT\s/)
diff --git a/crypto/opensslconf.h.in b/crypto/opensslconf.h.in
index cd05361eb8..4e28f3e666 100644
--- a/crypto/opensslconf.h.in
+++ b/crypto/opensslconf.h.in
@@ -1,6 +1,9 @@
 /* crypto/opensslconf.h */
 /* WARNING: This file is autogenerated by Configure */
 
+/* Generate 80386 code? */
+#undef I386_ONLY
+
 #if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
 #define OPENSSLDIR "/usr/local/ssl"
 #endif
@@ -34,7 +37,7 @@
 #if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
 #define CONFIG_HEADER_BN_H
 
-#define BN_LLONG
+#undef BN_LLONG
 
 /* Should we define BN_DIV2W here? */
 
@@ -53,7 +56,7 @@
 #define CONFIG_HEADER_RC4_LOCL_H
 /* if this is defined data[i] is used instead of *data, this is a %20
  * speedup on x86 */
-#define RC4_INDEX
+#undef RC4_INDEX
 #endif
 
 #if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
@@ -67,14 +70,14 @@
 /* the following is tweaked from a config script, that is why it is a
  * protected undef/define */
 #ifndef DES_PTR
-#define DES_PTR
+#undef DES_PTR
 #endif
 
 /* This helps C compiler generate the correct code for multiple functional
  * units.  It reduces register dependancies at the expense of 2 more
  * registers */
 #ifndef DES_RISC1
-#define DES_RISC1
+#undef DES_RISC1
 #endif
 
 #ifndef DES_RISC2
@@ -88,7 +91,7 @@ YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
 /* Unroll the inner loop, this sometimes helps, sometimes hinders.
  * Very mucy CPU dependant */
 #ifndef DES_UNROLL
-#define DES_UNROLL
+#undef DES_UNROLL
 #endif
 
 /* These default values were supplied by
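Aside, not part of the patch: the I386_ONLY switch generated above exists because the byte-swapping shortcut this patch adds later (the bswapl instruction in sha_locl.h) first appeared on the 80486, so a build targeting a plain 80386 must keep a shift-and-mask fallback. A stand-alone sketch of that gating pattern, with hypothetical names:

    /* Illustration only: how a generated I386_ONLY macro is typically
     * consumed.  swap32() is a made-up name, not from the tree. */
    #include <stdio.h>

    static unsigned int swap32(unsigned int x)
        {
    #if defined(__GNUC__) && defined(__i386) && !defined(I386_ONLY)
        /* bswap requires an 80486 or newer, hence the gate */
        asm ("bswapl %0" : "=r"(x) : "0"(x));
    #else
        /* portable fallback that an 80386 can run */
        x = (x>>24) | ((x>>8)&0xff00) | ((x&0xff00)<<8) | (x<<24);
    #endif
        return x;
        }

    int main(void)
        {
        printf("%08x\n", swap32(0x11223344));   /* prints 44332211 */
        return 0;
        }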
diff --git a/crypto/sha/sha.h b/crypto/sha/sha.h
index ba40aafc13..cd6960ee1a 100644
--- a/crypto/sha/sha.h
+++ b/crypto/sha/sha.h
@@ -67,18 +67,28 @@ extern "C" {
 #error SHA is disabled.
 #endif
 
-#define SHA_CBLOCK	64
-#define SHA_LBLOCK	16
-#define SHA_BLOCK	16
-#define SHA_LAST_BLOCK	56
-#define SHA_LENGTH_BLOCK 8
-#define SHA_DIGEST_LENGTH 20
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
+ * ! SHA_LONG_LOG2 has to be defined along.                        !
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
 
-#ifdef  WIN16
+#if defined(WIN16) || defined(__LP32__)
+#define SHA_LONG unsigned long
+#elif defined(_CRAY) || defined(__ILP64__)
 #define SHA_LONG unsigned long
+#define SHA_LONG_LOG2 3
 #else
 #define SHA_LONG unsigned int
-#endif
+#endif
+
+#define SHA_LBLOCK	16
+#define SHA_CBLOCK	(SHA_LBLOCK*4)	/* SHA treats input data as a
+					 * contiguous array of 32 bit
+					 * wide big-endian values. */
+#define SHA_LAST_BLOCK	(SHA_CBLOCK-8)
+#define SHA_DIGEST_LENGTH 20
 
 typedef struct SHAstate_st
 	{
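For orientation, a minimal caller of the interface declared above; this is a sketch rather than part of the patch, and the include path is an assumption about this source tree. The expected output is the standard SHA-1 test vector for "abc":

    #include <stdio.h>
    #include "sha.h"    /* path is an assumption; <openssl/sha.h> in later trees */

    int main(void)
        {
        SHA_CTX ctx;
        unsigned char md[SHA_DIGEST_LENGTH];
        int i;

        SHA1_Init(&ctx);
        SHA1_Update(&ctx,(unsigned char *)"abc",3);
        SHA1_Final(md,&ctx);
        for (i=0; i<SHA_DIGEST_LENGTH; i++)
            printf("%02x",md[i]);
        printf("\n");   /* a9993e364706816aba3e25717850c26c9cd0d89d */
        return 0;
        }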
diff --git a/crypto/sha/sha1dgst.c b/crypto/sha/sha1dgst.c
index f4a47f3768..e867f6972b 100644
--- a/crypto/sha/sha1dgst.c
+++ b/crypto/sha/sha1dgst.c
@@ -81,14 +81,14 @@ char *SHA1_version="SHA1" OPENSSL_VERSION_PTEXT;
 #define K_40_59 0x8f1bbcdcUL
 #define K_60_79 0xca62c1d6UL
 
-# ifdef SHA1_ASM
-   void sha1_block_x86(SHA_CTX *c, register SHA_LONG *p, int num);
-#  define sha1_block sha1_block_x86
-# else
-   void sha1_block(SHA_CTX *c, register SHA_LONG *p, int num);
-# endif
+#ifdef SHA1_ASM
+   void sha1_block_x86(SHA_CTX *c, register SHA_LONG *p, int num);
+#  define sha1_block(c,p,n)	sha1_block_x86((c),(p),(n)*SHA_CBLOCK)
+#else
+   static void sha1_block(SHA_CTX *c, register SHA_LONG *p, int num);
+#endif
 
-#if defined(L_ENDIAN) && defined(SHA1_ASM)
+#if !defined(B_ENDIAN) && defined(SHA1_ASM)
 # define M_c2nl		c2l
 # define M_p_c2nl	p_c2l
 # define M_c2nl_p	c2l_p
@@ -147,7 +147,7 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 			}
 
 		len-=(SHA_CBLOCK-c->num);
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		c->num=0;
 		/* drop through and do the rest */
 		}
@@ -184,15 +184,15 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 	 * copies it to a local array.  I should be able to do this for
 	 * the C version as well....
 	 */
-#if 1
+#if SHA_LONG_LOG2==2
 #if defined(B_ENDIAN) || defined(SHA1_ASM)
 	if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 		{
 		sw=len/SHA_CBLOCK;
 		if (sw)
 			{
-			sw*=SHA_CBLOCK;
 			sha1_block(c,(SHA_LONG *)data,sw);
+			sw*=SHA_CBLOCK;
 			data+=sw;
 			len-=sw;
 			}
@@ -204,35 +204,61 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 	p=c->data;
 	while (len >= SHA_CBLOCK)
 		{
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA1_ASM)
+#define SHA_NO_TAIL_CODE
+		/*
+		 * Basically we get here only when data happens
+		 * to be unaligned.
+		 */
 		if (p != (SHA_LONG *)data)
 			memcpy(p,data,SHA_CBLOCK);
 		data+=SHA_CBLOCK;
-# ifdef L_ENDIAN
-#  ifndef SHA1_ASM /* Will not happen */
-		for (sw=(SHA_LBLOCK/4); sw; sw--)
+		sha1_block(c,p=c->data,1);
+		len-=SHA_CBLOCK;
+#else	/* little-endian */
+#define BE_COPY(dst,src,i) {			\
+		l = ((SHA_LONG *)src)[i];	\
+		Endian_Reverse32(l);		\
+		dst[i] = l;			\
+		}
+		if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 			{
-			Endian_Reverse32(p[0]);
-			Endian_Reverse32(p[1]);
-			Endian_Reverse32(p[2]);
-			Endian_Reverse32(p[3]);
-			p+=4;
+			for (sw=(SHA_LBLOCK/4); sw; sw--)
+				{
+				BE_COPY(p,data,0);
+				BE_COPY(p,data,1);
+				BE_COPY(p,data,2);
+				BE_COPY(p,data,3);
+				p+=4;
+				data += 4*sizeof(SHA_LONG);
+				}
+			sha1_block(c,p=c->data,1);
+			len-=SHA_CBLOCK;
+			continue;
 			}
+#endif
+#endif
+#ifndef SHA_NO_TAIL_CODE
+		/*
+		 * In addition to "sizeof(SHA_LONG)!= 4" case the
+		 * following code covers unaligned access cases on
+		 * little-endian machines.
+		 *
+		 */
 		p=c->data;
-#  endif
-# endif
-#else
-		for (sw=(SHA_BLOCK/4); sw; sw--)
+		for (sw=(SHA_LBLOCK/4); sw; sw--)
 			{
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
+			M_c2nl(data,l); p[0]=l;
+			M_c2nl(data,l); p[1]=l;
+			M_c2nl(data,l); p[2]=l;
+			M_c2nl(data,l); p[3]=l;
+			p+=4;
 			}
 		p=c->data;
-#endif
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		len-=SHA_CBLOCK;
+#endif
 		}
 	ec=(int)len;
 	c->num=ec;
@@ -247,26 +273,35 @@ void SHA1_Update(SHA_CTX *c, const register unsigned char *data,
 
 void SHA1_Transform(SHA_CTX *c, unsigned char *b)
 	{
-	SHA_LONG p[16];
-#ifndef B_ENDIAN
+	SHA_LONG p[SHA_LBLOCK];
 	SHA_LONG *q;
 	int i;
-#endif
 
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-	memcpy(p,b,64);
-#ifdef L_ENDIAN
-	q=p;
-	for (i=(SHA_LBLOCK/4); i; i--)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA1_ASM)
+	memcpy(p,b,SHA_CBLOCK);
+	sha1_block(c,p,1);
+	return;
+#else
+	if (((unsigned long)b%sizeof(SHA_LONG)) == 0)
 		{
-		Endian_Reverse32(q[0]);
-		Endian_Reverse32(q[1]);
-		Endian_Reverse32(q[2]);
-		Endian_Reverse32(q[3]);
-		q+=4;
+		q=p;
+		for (i=(SHA_LBLOCK/4); i; i--)
+			{
+			unsigned long l;
+			BE_COPY(q,b,0);	/* BE_COPY was defined above */
+			BE_COPY(q,b,1);
+			BE_COPY(q,b,2);
+			BE_COPY(q,b,3);
+			q+=4;
+			b+=4*sizeof(SHA_LONG);
+			}
+		sha1_block(c,p,1);
+		return;
 		}
 #endif
-#else
+#endif
+#ifndef SHA_NO_TAIL_CODE /* defined above, see comment */
 	q=p;
 	for (i=(SHA_LBLOCK/4); i; i--)
 		{
@@ -276,16 +311,15 @@ void SHA1_Transform(SHA_CTX *c, unsigned char *b)
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		}
+	sha1_block(c,p,1);
 #endif
-	sha1_block(c,p,64);
 	}
 #ifndef SHA1_ASM
-
-void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
+static void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	{
 	register SHA_LONG A,B,C,D,E,T;
-	SHA_LONG X[16];
+	SHA_LONG X[SHA_LBLOCK];
 
 	A=c->h0;
 	B=c->h1;
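The hunks above replace the word-by-word Endian_Reverse32() pass with BE_COPY(): when the input is suitably aligned, each 32-bit word is fetched with a single load and byte-swapped in a register instead of being assembled from four byte loads. A self-contained sketch of the idea (hypothetical names; a portable swap stands in for the optimised macro):

    #include <stdio.h>

    /* portable stand-in for Endian_Reverse32 */
    #define Endian_Reverse32(a) ((a) = ((a)>>24) | (((a)>>8)&0x0000ff00) | \
                                       (((a)<<8)&0x00ff0000) | ((a)<<24))

    /* one aligned word load plus a swap, as in the patch's BE_COPY */
    #define BE_COPY(dst,src,i) {                                \
        unsigned int l = ((const unsigned int *)(src))[i];      \
        Endian_Reverse32(l);                                    \
        (dst)[i] = l;                                           \
        }

    int main(void)
        {
        static unsigned int aligned[2] = { 0, 0 };  /* forces word alignment */
        unsigned char *data = (unsigned char *)aligned;
        unsigned int w[2], j;

        for (j=0; j<8; j++) data[j]=(unsigned char)(j+1);
        if (((unsigned long)data % sizeof(unsigned int)) == 0)
            {                   /* fast path: word loads plus swap */
            BE_COPY(w,data,0);
            BE_COPY(w,data,1);
            }
        else
            {                   /* tail code: byte gathering */
            w[0]=(data[0]<<24)|(data[1]<<16)|(data[2]<<8)|data[3];
            w[1]=(data[4]<<24)|(data[5]<<16)|(data[6]<<8)|data[7];
            }
        printf("%08x %08x\n",w[0],w[1]);  /* 01020304 05060708 on little-endian */
        return 0;
        }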
@@ -385,8 +419,7 @@ void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	c->h3=(c->h3+B)&0xffffffffL;
 	c->h4=(c->h4+C)&0xffffffffL;
 
-	num-=64;
-	if (num <= 0) break;
+	if (--num <= 0) break;
 
 	A=c->h0;
 	B=c->h1;
@@ -394,7 +427,12 @@ void sha1_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	D=c->h3;
 	E=c->h4;
 
-	W+=16;
+	W+=SHA_LBLOCK;	/* Note! This can happen only when sizeof(SHA_LONG)
+			 * is 4. Whenever it's not the actual case this
+			 * function is never called with num larger than 1
+			 * and we never advance down here.
+			 *
+			 */
 	}
 }
 #endif
@@ -423,18 +461,20 @@ void SHA1_Final(unsigned char *md, SHA_CTX *c)
 		{
 		for (; i<SHA_LBLOCK; i++)
 			p[i]=0;
-		sha1_block(c,p,64);
+		sha1_block(c,p,1);
 		i=0;
 		}
 	for (; i<(SHA_LBLOCK-2); i++)
 		p[i]=0;
 	p[SHA_LBLOCK-2]=c->Nh;
 	p[SHA_LBLOCK-1]=c->Nl;
-#if defined(L_ENDIAN) && defined(SHA1_ASM)
+#if SHA_LONG_LOG2==2
+#if !defined(B_ENDIAN) && defined(SHA1_ASM)
 	Endian_Reverse32(p[SHA_LBLOCK-2]);
 	Endian_Reverse32(p[SHA_LBLOCK-1]);
 #endif
-	sha1_block(c,p,64);
+#endif
+	sha1_block(c,p,1);
 	cp=md;
 	l=c->h0; nl2c(l,cp);
 	l=c->h1; nl2c(l,cp);
@@ -442,10 +482,11 @@ void SHA1_Final(unsigned char *md, SHA_CTX *c)
 	l=c->h3; nl2c(l,cp);
 	l=c->h4; nl2c(l,cp);
 
-	/* clear stuff, sha1_block may be leaving some stuff on the stack
-	 * but I'm not worried :-) */
 	c->num=0;
-/*	memset((char *)&c,0,sizeof(c));*/
+	/* sha_block may be leaving some stuff on the stack
+	 * but I'm not worried :-)
+	memset((void *)c,0,sizeof(SHA_CTX));
+	 */
 	}
 #endif
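Note the change of units running through this file: sha1_block() now takes a count of 64-byte blocks rather than a byte count, and the assembler binding converts back at the call site. A sketch of the convention (the stub stands in for the real assembler routine):

    #define SHA_CBLOCK 64

    /* stand-in for the x86 assembler routine, which still takes bytes */
    static void sha1_block_x86(void *c, unsigned int *p, int num)
        {
        /* would consume num/SHA_CBLOCK blocks here */
        }

    /* the macro from the patch: block count in, byte count out */
    #define sha1_block(c,p,n) sha1_block_x86((c),(p),(n)*SHA_CBLOCK)

    /* so a portable caller writes sha1_block(c,p,2) to hash two blocks,
     * while the C implementation's loop simply ends with
     * if (--num <= 0) break;  */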
diff --git a/crypto/sha/sha_dgst.c b/crypto/sha/sha_dgst.c
index 5827c73cea..d90f497763 100644
--- a/crypto/sha/sha_dgst.c
+++ b/crypto/sha/sha_dgst.c
@@ -81,12 +81,21 @@ char *SHA_version="SHA" OPENSSL_VERSION_PTEXT;
 #define K_40_59 0x8f1bbcdcUL
 #define K_60_79 0xca62c1d6UL
 
-	void sha_block(SHA_CTX *c, register SHA_LONG *p, int num);
-#define	M_c2nl		c2nl
-#define	M_p_c2nl	p_c2nl
-#define	M_c2nl_p	c2nl_p
-#define	M_p_c2nl_p	p_c2nl_p
-#define	M_nl2c		nl2c
+static void sha_block(SHA_CTX *c, register SHA_LONG *p, int num);
+
+#if !defined(B_ENDIAN) && defined(SHA_ASM)
+# define M_c2nl		c2l
+# define M_p_c2nl	p_c2l
+# define M_c2nl_p	c2l_p
+# define M_p_c2nl_p	p_c2l_p
+# define M_nl2c		l2c
+#else
+# define M_c2nl		c2nl
+# define M_p_c2nl	p_c2nl
+# define M_c2nl_p	c2nl_p
+# define M_p_c2nl_p	p_c2nl_p
+# define M_nl2c		nl2c
+#endif
 
 void SHA_Init(SHA_CTX *c)
 	{
@@ -133,7 +142,7 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 			}
 
 		len-=(SHA_CBLOCK-c->num);
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		c->num=0;
 		/* drop through and do the rest */
 		}
@@ -170,15 +179,15 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 	 * copies it to a local array.  I should be able to do this for
 	 * the C version as well....
 	 */
-#if 1
+#if SHA_LONG_LOG2==2
 #if defined(B_ENDIAN) || defined(SHA_ASM)
 	if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 		{
 		sw=len/SHA_CBLOCK;
 		if (sw)
 			{
-			sw*=SHA_CBLOCK;
 			sha_block(c,(SHA_LONG *)data,sw);
+			sw*=SHA_CBLOCK;
 			data+=sw;
 			len-=sw;
 			}
@@ -190,35 +199,61 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 	p=c->data;
 	while (len >= SHA_CBLOCK)
 		{
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA_ASM)
+#define SHA_NO_TAIL_CODE
+		/*
+		 * Basically we get here only when data happens
+		 * to be unaligned.
+		 */
 		if (p != (SHA_LONG *)data)
 			memcpy(p,data,SHA_CBLOCK);
 		data+=SHA_CBLOCK;
-# ifdef L_ENDIAN
-#  ifndef SHA_ASM /* Will not happen */
-		for (sw=(SHA_LBLOCK/4); sw; sw--)
+		sha_block(c,p=c->data,1);
+		len-=SHA_CBLOCK;
+#else	/* little-endian */
+#define BE_COPY(dst,src,i) {			\
+		l = ((SHA_LONG *)src)[i];	\
+		Endian_Reverse32(l);		\
+		dst[i] = l;			\
+		}
+		if ((((unsigned long)data)%sizeof(SHA_LONG)) == 0)
 			{
-			Endian_Reverse32(p[0]);
-			Endian_Reverse32(p[1]);
-			Endian_Reverse32(p[2]);
-			Endian_Reverse32(p[3]);
-			p+=4;
+			for (sw=(SHA_LBLOCK/4); sw; sw--)
+				{
+				BE_COPY(p,data,0);
+				BE_COPY(p,data,1);
+				BE_COPY(p,data,2);
+				BE_COPY(p,data,3);
+				p+=4;
+				data += 4*sizeof(SHA_LONG);
+				}
+			sha_block(c,p=c->data,1);
+			len-=SHA_CBLOCK;
+			continue;
 			}
+#endif
+#endif
+#ifndef SHA_NO_TAIL_CODE
+		/*
+		 * In addition to "sizeof(SHA_LONG)!= 4" case the
+		 * following code covers unaligned access cases on
+		 * little-endian machines.
+		 *
+		 */
 		p=c->data;
-#  endif
-# endif
-#else
-		for (sw=(SHA_BLOCK/4); sw; sw--)
+		for (sw=(SHA_LBLOCK/4); sw; sw--)
 			{
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
-			M_c2nl(data,l); *(p++)=l;
+			M_c2nl(data,l); p[0]=l;
+			M_c2nl(data,l); p[1]=l;
+			M_c2nl(data,l); p[2]=l;
+			M_c2nl(data,l); p[3]=l;
+			p+=4;
 			}
 		p=c->data;
-#endif
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		len-=SHA_CBLOCK;
+#endif
 		}
 	ec=(int)len;
 	c->num=ec;
@@ -233,26 +268,35 @@ void SHA_Update(SHA_CTX *c, const register unsigned char *data,
 
 void SHA_Transform(SHA_CTX *c, unsigned char *b)
 	{
-	SHA_LONG p[16];
-#if !defined(B_ENDIAN)
+	SHA_LONG p[SHA_LBLOCK];
 	SHA_LONG *q;
 	int i;
-#endif
 
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-	memcpy(p,b,64);
-#ifdef L_ENDIAN
-	q=p;
-	for (i=(SHA_LBLOCK/4); i; i--)
+#if SHA_LONG_LOG2==2
+#if defined(B_ENDIAN) || defined(SHA_ASM)
+	memcpy(p,b,SHA_CBLOCK);
+	sha_block(c,p,1);
+	return;
+#else
+	if (((unsigned long)b%sizeof(SHA_LONG)) == 0)
 		{
-		Endian_Reverse32(q[0]);
-		Endian_Reverse32(q[1]);
-		Endian_Reverse32(q[2]);
-		Endian_Reverse32(q[3]);
-		q+=4;
+		q=p;
+		for (i=(SHA_LBLOCK/4); i; i--)
+			{
+			unsigned long l;
+			BE_COPY(q,b,0);	/* BE_COPY was defined above */
+			BE_COPY(q,b,1);
+			BE_COPY(q,b,2);
+			BE_COPY(q,b,3);
+			q+=4;
+			b+=4*sizeof(SHA_LONG);
+			}
+		sha_block(c,p,1);
+		return;
 		}
 #endif
-#else
+#endif
+#ifndef SHA_NO_TAIL_CODE /* defined above, see comment */
 	q=p;
 	for (i=(SHA_LBLOCK/4); i; i--)
 		{
@@ -262,14 +306,15 @@ void SHA_Transform(SHA_CTX *c, unsigned char *b)
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		c2nl(b,l); *(q++)=l;
 		}
+	sha_block(c,p,1);
 #endif
-	sha_block(c,p,64);
 	}
 
-void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
+#ifndef SHA_ASM
+static void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	{
 	register SHA_LONG A,B,C,D,E,T;
-	SHA_LONG X[16];
+	SHA_LONG X[SHA_LBLOCK];
 
 	A=c->h0;
 	B=c->h1;
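The M_c2nl family above now switches between c2nl (big-endian byte gathering, SHA's natural input order) and c2l (little-endian) when the assembler path is in use, since the assembler routine expects host-order words. Minimal re-implementations of the two gathering macros, for illustration only and not the tree's exact definitions:

    #include <stdio.h>

    #define c2nl(c,l) (l = (((unsigned long)(c)[0])<<24) | \
                           (((unsigned long)(c)[1])<<16) | \
                           (((unsigned long)(c)[2])<< 8) | \
                           (((unsigned long)(c)[3])     ), (c)+=4)
    #define c2l(c,l)  (l = (((unsigned long)(c)[0])     ) | \
                           (((unsigned long)(c)[1])<< 8) | \
                           (((unsigned long)(c)[2])<<16) | \
                           (((unsigned long)(c)[3])<<24), (c)+=4)

    int main(void)
        {
        const unsigned char b[4] = {0xde,0xad,0xbe,0xef};
        const unsigned char *p;
        unsigned long l;

        p = b; c2nl(p,l); printf("c2nl: %08lx\n", l);  /* deadbeef */
        p = b; c2l(p,l);  printf("c2l:  %08lx\n", l);  /* efbeadde */
        return 0;
        }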
@@ -369,8 +414,7 @@ void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	c->h3=(c->h3+B)&0xffffffffL;
 	c->h4=(c->h4+C)&0xffffffffL;
 
-	num-=64;
-	if (num <= 0) break;
+	if (--num <= 0) break;
 
 	A=c->h0;
 	B=c->h1;
@@ -378,9 +422,15 @@ void sha_block(SHA_CTX *c, register SHA_LONG *W, int num)
 	D=c->h3;
 	E=c->h4;
 
-	W+=16;
+	W+=SHA_LBLOCK;	/* Note! This can happen only when sizeof(SHA_LONG)
+			 * is 4. Whenever it's not the actual case this
+			 * function is never called with num larger than 1
+			 * and we never advance down here.
+			 *
+			 */
 	}
 }
+#endif
 
 void SHA_Final(unsigned char *md, SHA_CTX *c)
 	{
@@ -406,14 +456,20 @@ void SHA_Final(unsigned char *md, SHA_CTX *c)
 		{
 		for (; i<SHA_LBLOCK; i++)
 			p[i]=0;
-		sha_block(c,p,64);
+		sha_block(c,p,1);
 		i=0;
 		}
 	for (; i<(SHA_LBLOCK-2); i++)
 		p[i]=0;
 	p[SHA_LBLOCK-2]=c->Nh;
 	p[SHA_LBLOCK-1]=c->Nl;
-	sha_block(c,p,64);
+#if SHA_LONG_LOG2==2
+#if !defined(B_ENDIAN) && defined(SHA_ASM)
+	Endian_Reverse32(p[SHA_LBLOCK-2]);
+	Endian_Reverse32(p[SHA_LBLOCK-1]);
+#endif
+#endif
+	sha_block(c,p,1);
 	cp=md;
 	l=c->h0; nl2c(l,cp);
 	l=c->h1; nl2c(l,cp);
@@ -421,9 +477,10 @@ void SHA_Final(unsigned char *md, SHA_CTX *c)
 	l=c->h3; nl2c(l,cp);
 	l=c->h4; nl2c(l,cp);
 
-	/* clear stuff, sha_block may be leaving some stuff on the stack
-	 * but I'm not worried :-) */
 	c->num=0;
-/*	memset((char *)&c,0,sizeof(c));*/
+	/* sha_block may be leaving some stuff on the stack
+	 * but I'm not worried :-)
+	memset((void *)c,0,sizeof(SHA_CTX));
+	 */
 	}
 #endif
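Both SHA_Final() variants above end the message with the usual MD-style length block: the 64-bit bit count, kept in c->Nh and c->Nl, lands in the last two words of the final block, high half first. A small worked example of that encoding (names are illustrative, and the mandatory 0x80 terminator byte is omitted for brevity):

    #include <stdio.h>

    int main(void)
        {
        unsigned long len=3;                     /* bytes hashed, e.g. "abc" */
        unsigned long Nl=(len<<3)&0xffffffffUL;  /* low 32 bits of bit count */
        unsigned long Nh=(len>>29);              /* high 32 bits of bit count */
        unsigned long p[16];                     /* the final 16-word block */
        int i;

        for (i=0; i<14; i++) p[i]=0;             /* padding words */
        p[16-2]=Nh;                              /* p[SHA_LBLOCK-2]=c->Nh; */
        p[16-1]=Nl;                              /* p[SHA_LBLOCK-1]=c->Nl; */
        printf("Nh=%08lx Nl=%08lx\n",Nh,Nl);     /* Nh=00000000 Nl=00000018 */
        return 0;
        }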
diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h
index 9f1251e787..32bbe30afd 100644
--- a/crypto/sha/sha_locl.h
+++ b/crypto/sha/sha_locl.h
@@ -158,30 +158,79 @@
 			 *((c)++)=(unsigned char)(((l)>>16)&0xff), \
 			 *((c)++)=(unsigned char)(((l)>>24)&0xff))
 
+#ifndef SHA_LONG_LOG2
+#define SHA_LONG_LOG2	2	/* default to 32 bits */
+#endif
+
 #undef ROTATE
+#undef Endian_Reverse32
+
 #if defined(WIN32)
 #define ROTATE(a,n)	_lrotl(a,n)
-#else
-#define ROTATE(a,n)	(((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#elif defined(__GNUC__)
+/* some inline assembler templates by */
+#if defined(__i386)
+#define ROTATE(a,n)	({ register unsigned int ret;	\
+				asm ("roll %1,%0"	\
+				: "=r"(ret)		\
+				: "I"(n), "0"(a)	\
+				: "cc");		\
+			   ret;				\
+			})
+#ifndef I386_ONLY
+#define Endian_Reverse32(a) \
+			{ register unsigned int l=(a);	\
+				asm ("bswapl %0"	\
+				: "=r"(l) : "0"(l));	\
+			  (a)=l;			\
+			}
+#endif
+#elif defined(__powerpc)
+#define ROTATE(a,n)	({ register unsigned int ret;		\
+				asm ("rlwinm %0,%1,%2,0,31"	\
+				: "=r"(ret)			\
+				: "r"(a), "I"(n));		\
+			   ret;					\
+			})
+/* Endian_Reverse32 is not needed for PowerPC */
+#endif
 #endif
 
 /* A nice byte order reversal from Wei Dai */
-#if defined(WIN32)
+#ifdef ROTATE
+#ifndef Endian_Reverse32
 /* 5 instructions with rotate instruction, else 9 */
 #define Endian_Reverse32(a) \
 	{ \
-	unsigned long l=(a); \
-	(a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \
+	unsigned long t=(a); \
+	(a)=((ROTATE(t,8)&0x00FF00FF)|(ROTATE((t&0x00FF00FF),24))); \
 	}
+#endif
 #else
+#define ROTATE(a,n)	(((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#ifndef Endian_Reverse32
 /* 6 instructions with rotate instruction, else 8 */
 #define Endian_Reverse32(a) \
 	{ \
-	unsigned long l=(a); \
-	l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \
-	(a)=ROTATE(l,16L); \
+	unsigned long t=(a); \
+	t=(((t>>8)&0x00FF00FF)|((t&0x00FF00FF)<<8)); \
+	(a)=ROTATE(t,16); \
 	}
 #endif
+/*
+ * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
+ * It's rewritten as above for two reasons:
+ *	- RISCs aren't good at long constants and have to explicitely
+ *	  compose 'em with several (well, usually 2) instructions in a
+ *	  register before performing the actual operation and (as you
+ *	  already realized:-) having same constant should inspire the
+ *	  compiler to permanently allocate the only register for it;
+ *	- most modern CPUs have two ALUs, but usually only one has
+ *	  circuitry for shifts:-( this minor tweak inspires compiler
+ *	  to schedule shift instructions in a better way...
+ *
+ *
+ */
+#endif
 
 /* As pointed out by Wei Dai , F() below can be
  * simplified to the code in F_00_19. Wei attributes these optimisations
  * to Peter Gutmann's SHS code, and he attributes it to Rich
  * Schroeppel.
  */
 
 #define	F_00_19(b,c,d)	((((c) ^ (d)) & (b)) ^ (d))
 #define	F_20_39(b,c,d)	((b) ^ (c) ^ (d))
 #define	F_40_59(b,c,d)	(((b) & (c)) | (((b)|(c)) & (d)))
 #define	F_60_79(b,c,d)	F_20_39(b,c,d)
 
-#ifdef SHA_0
 #undef Xupdate
+#ifdef SHA_0
 #define Xupdate(a,i,ia,ib,ic,id) X[(i)&0x0f]=(a)=\
 	(ia[(i)&0x0f]^ib[((i)+2)&0x0f]^ic[((i)+8)&0x0f]^id[((i)+13)&0x0f]);
 #endif
 #ifdef SHA_1
-#undef Xupdate
 #define Xupdate(a,i,ia,ib,ic,id) (a)=\
 	(ia[(i)&0x0f]^ib[((i)+2)&0x0f]^ic[((i)+8)&0x0f]^id[((i)+13)&0x0f]);\
 	X[(i)&0x0f]=(a)=ROTATE((a),1);
-- 
GitLab
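Two identities the sha_locl.h hunks rely on can be machine-checked. First, the rewritten Endian_Reverse32 really is a byte swap: ROTATE(t,8) masked with 0x00FF00FF leaves two of the bytes in place, and rotating (t & 0x00FF00FF) by 24 delivers the other two. Second, the F_00_19 form quoted from Wei Dai equals the textbook choice function (x & y) | (~x & z). A quick self-test, using unsigned int for a 32-bit word since the 1999 code assumed 32-bit longs:

    #include <stdio.h>

    #define ROTATE(a,n)    (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
    #define F_ORIG(b,c,d)  (((b) & (c)) | ((~(b)) & (d)))
    #define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))

    static unsigned int swap_rot(unsigned int t)
        {   /* the two-rotate form from sha_locl.h */
        return (ROTATE(t,8)&0x00FF00FF)|(ROTATE((t&0x00FF00FF),24));
        }

    int main(void)
        {
        unsigned int b,c,d;
        unsigned int x=0x11223344;

        printf("%08x\n",swap_rot(x));           /* prints 44332211 */
        for (b=0; b<2; b++)
         for (c=0; c<2; c++)
          for (d=0; d<2; d++)
           if ((F_ORIG(b,c,d)&1)!=(F_00_19(b,c,d)&1))
                printf("mismatch at %u%u%u\n",b,c,d);   /* never fires */
        return 0;
        }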