diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl index 9f2d15951433e97adc4147611ac312d340e0051e..8306fc88ccf65784a50fe72300baca749d1925e1 100644 --- a/crypto/sha/asm/sha1-sparcv9.pl +++ b/crypto/sha/asm/sha1-sparcv9.pl @@ -8,13 +8,15 @@ # ==================================================================== # Performance improvement is not really impressive on pre-T1 CPU: +8% -# over Sun C and +25% over gcc [3.3]. While on T1, ... And there -# is a gimmick. X[16] vector is packed to 8 64-bit registers and as -# result nothing is spilled on stack. In addition input data is loaded -# in compact instruction sequence, thus minimizing the window when the -# code is subject to [inter-thread] cache-thrashing hazard. The goal -# is to ensure scalability on UltraSPARC T1, or rather to avoid decay -# when amount of active threads exceeds the number of physical cores. +# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it +# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and +# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick. +# X[16] vector is packed to 8 64-bit registers and as a result nothing +# is spilled on the stack. In addition input data is loaded in a compact +# instruction sequence, thus minimizing the window when the code is +# subject to [inter-thread] cache-thrashing hazard. The goal is to +# ensure scalability on UltraSPARC T1, or rather to avoid decay when +# the number of active threads exceeds the number of physical cores. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl index bd9afcb1155e01a192b87821b8dc97004a85bbc5..25f80390aca3a52735c49a19d59878c175be82c1 100644 --- a/crypto/sha/asm/sha512-sparcv9.pl +++ b/crypto/sha/asm/sha512-sparcv9.pl @@ -23,7 +23,16 @@ # # SHA512 on UltraSPARC T1. # -# ... +# It's not any faster than 64-bit code generated by Sun C 5.8. 
This is +# because the 64-bit code generator has the advantage of using 64-bit +# loads to access X[16], which I consciously traded for 32-/64-bit ABI +# duality [as per above]. But it surpasses 32-bit Sun C generated code +# by 60%, not to mention that it doesn't suffer from severe decay when +# running 4 times as many threads as physical cores and that it leaves gcc [3.4] +# behind by over a 4x factor! If compared to SHA256, single-thread +# performance is only 10% better, but overall throughput for the maximum +# number of threads for a given CPU exceeds the corresponding one of SHA256 +# by 30% [again, the optimal coefficient is 50%]. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }