提交 b4b48a10 编写于 作者: A Andy Polyakov

ppc64-mont.pl: adapt for 32-bit and engage for all builds.

上级 7e765bf2
...@@ -135,8 +135,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void"; ...@@ -135,8 +135,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
my $mips3_asm=":bn-mips3.o::::::::::::void"; my $mips3_asm=":bn-mips3.o::::::::::::void";
my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::void"; my $s390x_asm="s390xcpuid.o:bn-s390x.o s390x-mont.o::aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::void";
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::"; my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::"; my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";
my $no_asm=":::::::::::::void"; my $no_asm=":::::::::::::void";
# As for $BSDthreads. Idea is to maintain "collective" set of flags, # As for $BSDthreads. Idea is to maintain "collective" set of flags,
...@@ -547,7 +547,7 @@ my %table=( ...@@ -547,7 +547,7 @@ my %table=(
##### MacOS X (a.k.a. Rhapsody or Darwin) setup ##### MacOS X (a.k.a. Rhapsody or Darwin) setup
"rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::", "rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
"darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-ppc-cc","cc:-arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin64-ppc-cc","cc:-arch ppc64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc64_asm}:osx64:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin64-ppc-cc","cc:-arch ppc64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc64_asm}:osx64:dlfcn:darwin-shared:-fPIC -fno-common:-arch ppc64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin-i386-cc","cc:-arch i386 -O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "darwin-i386-cc","cc:-arch i386 -O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib", "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
......
...@@ -814,8 +814,8 @@ $thread_cflag = -qthreaded ...@@ -814,8 +814,8 @@ $thread_cflag = -qthreaded
$sys_id = AIX $sys_id = AIX
$lflags = $lflags =
$bn_ops = BN_LLONG RC4_CHAR $bn_ops = BN_LLONG RC4_CHAR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -845,8 +845,8 @@ $thread_cflag = -pthread ...@@ -845,8 +845,8 @@ $thread_cflag = -pthread
$sys_id = AIX $sys_id = AIX
$lflags = $lflags =
$bn_ops = BN_LLONG RC4_CHAR $bn_ops = BN_LLONG RC4_CHAR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -907,8 +907,8 @@ $thread_cflag = -qthreaded ...@@ -907,8 +907,8 @@ $thread_cflag = -qthreaded
$sys_id = AIX $sys_id = AIX
$lflags = $lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o ppc-mont.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -938,8 +938,8 @@ $thread_cflag = -pthread ...@@ -938,8 +938,8 @@ $thread_cflag = -pthread
$sys_id = AIX $sys_id = AIX
$lflags = $lflags =
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o ppc-mont.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -1211,14 +1211,14 @@ $multilib = ...@@ -1211,14 +1211,14 @@ $multilib =
*** darwin-ppc-cc *** darwin-ppc-cc
$cc = cc $cc = cc
$cflags = -arch ppc -O3 -DB_ENDIAN $cflags = -arch ppc -O3 -DB_ENDIAN -Wa,-force_cpusubtype_ALL
$unistd = $unistd =
$thread_cflag = -D_REENTRANT $thread_cflag = -D_REENTRANT
$sys_id = MACOSX $sys_id = MACOSX
$lflags = -Wl,-search_paths_first% $lflags = -Wl,-search_paths_first%
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -1248,8 +1248,8 @@ $thread_cflag = -D_REENTRANT ...@@ -1248,8 +1248,8 @@ $thread_cflag = -D_REENTRANT
$sys_id = MACOSX $sys_id = MACOSX
$lflags = -Wl,-search_paths_first% $lflags = -Wl,-search_paths_first%
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o ppc-mont.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -1682,8 +1682,8 @@ $thread_cflag = -D_REENTRANT ...@@ -1682,8 +1682,8 @@ $thread_cflag = -D_REENTRANT
$sys_id = MACOSX $sys_id = MACOSX
$lflags = $lflags =
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -2327,7 +2327,7 @@ $multilib = ...@@ -2327,7 +2327,7 @@ $multilib =
*** debug-steve32 *** debug-steve32
$cc = gcc $cc = gcc
$cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -pipe $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m32 -DL_ENDIAN -DCONF_DEBUG -DDEBUG_SAFESTACK -g -pipe
$unistd = $unistd =
$thread_cflag = -D_REENTRANT $thread_cflag = -D_REENTRANT
$sys_id = $sys_id =
...@@ -2358,7 +2358,7 @@ $multilib = ...@@ -2358,7 +2358,7 @@ $multilib =
*** debug-steve64 *** debug-steve64
$cc = gcc $cc = gcc
$cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -DDEBUG_UNUSED -g -DMD32_REG_T=int $cflags = -Wall -pedantic -DPEDANTIC -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wshadow -Wformat -Werror -DCRYPTO_MDEBUG_ALL -DCRYPTO_MDEBUG_ABORT -DREF_CHECK -DOPENSSL_NO_DEPRECATED -m64 -DL_ENDIAN -DTERMIO -DCONF_DEBUG -DDEBUG_SAFESTACK -g -DMD32_REG_T=int
$unistd = $unistd =
$thread_cflag = -D_REENTRANT $thread_cflag = -D_REENTRANT
$sys_id = $sys_id =
...@@ -3666,8 +3666,8 @@ $thread_cflag = -D_REENTRANT ...@@ -3666,8 +3666,8 @@ $thread_cflag = -D_REENTRANT
$sys_id = $sys_id =
$lflags = -ldl $lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
...@@ -3697,8 +3697,8 @@ $thread_cflag = -D_REENTRANT ...@@ -3697,8 +3697,8 @@ $thread_cflag = -D_REENTRANT
$sys_id = $sys_id =
$lflags = -ldl $lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
$cpuid_obj = ppccpuid.o $cpuid_obj = ppccpuid.o ppccap.o
$bn_obj = bn-ppc.o ppc-mont.o $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
$des_obj = $des_obj =
$aes_obj = aes_core.o aes_cbc.o aes-ppc.o $aes_obj = aes_core.o aes_cbc.o aes-ppc.o
$bf_obj = $bf_obj =
......
...@@ -103,6 +103,7 @@ pa-risc2.o: asm/pa-risc2.s ...@@ -103,6 +103,7 @@ pa-risc2.o: asm/pa-risc2.s
# ppc - AIX, Linux, MacOS X... # ppc - AIX, Linux, MacOS X...
bn-ppc.s: asm/ppc.pl; $(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@ bn-ppc.s: asm/ppc.pl; $(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@
ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@ ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@
ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@
alpha-mont.s: asm/alpha-mont.pl alpha-mont.s: asm/alpha-mont.pl
$(PERL) $< | $(CC) -E - | tee $@ > /dev/null $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
......
...@@ -108,14 +108,19 @@ $code=<<___; ...@@ -108,14 +108,19 @@ $code=<<___;
.machine "any" .machine "any"
.text .text
.globl .bn_mul_mont .globl .bn_mul_mont_int
.align 4 .align 4
.bn_mul_mont: .bn_mul_mont_int:
cmpwi $num,4 cmpwi $num,4
mr $rp,r3 ; $rp is reassigned mr $rp,r3 ; $rp is reassigned
li r3,0 li r3,0
bltlr bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)` slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096 li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE` addi $ovf,$num,`$FRAME+$RZONE`
......
...@@ -45,23 +45,41 @@ ...@@ -45,23 +45,41 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is... # in absolute terms, but it's apparently the way Power 6 is...
# December 2009
# Adapted for 32-bit build this module delivers 25-120%, more for
# longer keys, performance improvement on 1.8GHz PPC970. However!
# This implementation utilizes even 64-bit integer operations and
# trouble is that most PPC operating systems don't preserve upper
# halves of general purpose registers upon signal delivery. They do
# preserve them upon context switch, but not signalling:-( This means
# that asynchronous signals have to be blocked upon entry to this
# subroutine. Signal masking (and complementary unmasking) has quite
# an impact on performance, naturally larger for shorter keys. It's
# so severe that shorter key performance is as low as 1/3 of expected
# one. This is why this routine should be engaged for longer key
# operations only, see crypto/ppccap.c for further details.
# Alternative is to break dependence on upper halves of GPRs...
# MacOS X is an exception from this and doesn't require signal
# masking, and that's where above improvement coefficients were
# collected.
$flavour = shift; $flavour = shift;
if ($flavour =~ /32/) { if ($flavour =~ /32/) {
$SIZE_T=4; $SIZE_T=4;
$RZONE= 224; $RZONE= 224;
$FRAME= $SIZE_T*12+8*12; $FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont_ppc64"; $fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update $STUX= "stwux"; # store indexed and update
$PUSH= "stw"; $PUSH= "stw";
$POP= "lwz"; $POP= "lwz";
die "not implemented yet";
} elsif ($flavour =~ /64/) { } elsif ($flavour =~ /64/) {
$SIZE_T=8; $SIZE_T=8;
$RZONE= 288; $RZONE= 288;
$FRAME= $SIZE_T*12+8*12; $FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont"; $fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics... # same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update $STUX= "stdux"; # store indexed and update
...@@ -181,14 +199,14 @@ $code=<<___; ...@@ -181,14 +199,14 @@ $code=<<___;
.globl .$fname .globl .$fname
.align 5 .align 5
.$fname: .$fname:
cmpwi $num,4 cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code li r3,0 ; possible "not handled" return code
bltlr- bltlr-
andi. r0,$num,1 ; $num has to be even andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr- bnelr-
slwi $num,$num,3 ; num*=8 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096 li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1] add $tp,$tp,$num ; place for tp[num+1]
...@@ -220,11 +238,25 @@ $code=<<___; ...@@ -220,11 +238,25 @@ $code=<<___;
stfd f23,`12*$SIZE_T+72`($sp) stfd f23,`12*$SIZE_T+72`($sp)
stfd f24,`12*$SIZE_T+80`($sp) stfd f24,`12*$SIZE_T+80`($sp)
stfd f25,`12*$SIZE_T+88`($sp) stfd f25,`12*$SIZE_T+88`($sp)
___
$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0] ld $t3,0($bp) ; bp[0]
___
$code.=<<___ if ($SIZE_T==4);
mr $t1,$n0
lwz $a0,0($ap) ; pull ap[0,1] value
lwz $t0,4($ap)
lwz $n0,0($t1) ; pull n0[0,1] value
lwz $t1,4($t1)
lwz $t3,0($bp) ; bp[0,1]
lwz $t2,4($bp)
insrdi $a0,$t0,32,0
insrdi $n0,$t1,32,0
insrdi $t3,$t2,32,0
___
$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64` addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64 li $i,-64
add $nap_d,$tp,$num add $nap_d,$tp,$num
...@@ -258,6 +290,8 @@ $code=<<___; ...@@ -258,6 +290,8 @@ $code=<<___;
std $t5,`$FRAME+40`($sp) std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp) std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp) std $t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap) lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
...@@ -266,6 +300,18 @@ $code=<<___; ...@@ -266,6 +300,18 @@ $code=<<___;
lwz $t5,0($np) lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np) lwz $t7,8($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
lwz $t1,4($ap)
lwz $t2,8($ap)
lwz $t3,12($ap)
lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
lwz $t5,4($np)
lwz $t6,8($np)
lwz $t7,12($np)
___
$code.=<<___;
lfd $ba,`$FRAME+0`($sp) lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp) lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp) lfd $bc,`$FRAME+16`($sp)
...@@ -374,6 +420,8 @@ $code=<<___; ...@@ -374,6 +420,8 @@ $code=<<___;
.align 5 .align 5
L1st: L1st:
___
$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap) lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
...@@ -382,6 +430,18 @@ L1st: ...@@ -382,6 +430,18 @@ L1st:
lwz $t5,0($np) lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np) lwz $t7,8($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
lwz $t1,4($ap)
lwz $t2,8($ap)
lwz $t3,12($ap)
lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
lwz $t5,4($np)
lwz $t6,8($np)
lwz $t7,12($np)
___
$code.=<<___;
std $t0,`$FRAME+64`($sp) std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp) std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp) std $t2,`$FRAME+80`($sp)
...@@ -559,7 +619,17 @@ L1st: ...@@ -559,7 +619,17 @@ L1st:
li $i,8 ; i=1 li $i,8 ; i=1
.align 5 .align 5
Louter: Louter:
___
$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i] ldx $t3,$bp,$i ; bp[i]
___
$code.=<<___ if ($SIZE_T==4);
add $t0,$bp,$i
lwz $t3,0($t0) ; bp[i,i+1]
lwz $t0,4($t0)
insrdi $t3,$t0,32,0
___
$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i] mulld $t7,$a0,$t3 ; ap[0]*bp[i]
...@@ -761,6 +831,13 @@ Linner: ...@@ -761,6 +831,13 @@ Linner:
stfd $T0b,`$FRAME+8`($sp) stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry add $t7,$t7,$carry
addc $t3,$t0,$t1 addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t0,$t0,32,0
extrdi $t1,$t1,32,0
adde $t0,$t0,$t1
___
$code.=<<___;
stfd $T1a,`$FRAME+16`($sp) stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp) stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits insrdi $t4,$t7,16,0 ; 64..127 bits
...@@ -768,6 +845,13 @@ Linner: ...@@ -768,6 +845,13 @@ Linner:
stfd $T2a,`$FRAME+32`($sp) stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp) stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2 adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t4,$t4,32,0
extrdi $t2,$t2,32,0
adde $t4,$t4,$t2
___
$code.=<<___;
stfd $T3a,`$FRAME+48`($sp) stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp) stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry addze $carry,$carry
...@@ -816,7 +900,21 @@ Linner: ...@@ -816,7 +900,21 @@ Linner:
ld $t7,`$FRAME+72`($sp) ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1 addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t0,$t0,32,0
extrdi $t1,$t1,32,0
adde $t0,$t0,$t1
___
$code.=<<___;
adde $t5,$t4,$t2 adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t4,$t4,32,0
extrdi $t2,$t2,32,0
adde $t4,$t4,$t2
___
$code.=<<___;
addze $carry,$carry addze $carry,$carry
std $t3,-16($tp) ; tp[j-1] std $t3,-16($tp) ; tp[j-1]
...@@ -835,7 +933,9 @@ Linner: ...@@ -835,7 +933,9 @@ Linner:
subf $nap_d,$t7,$nap_d ; rewind pointer subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num cmpw $i,$num
blt- Louter blt- Louter
___
$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA] subfc $i,$i,$i ; j=0 and "clear" XER[CA]
...@@ -883,7 +983,74 @@ Lcopy: ; copy or in-place refresh ...@@ -883,7 +983,74 @@ Lcopy: ; copy or in-place refresh
stdx $i,$t4,$i stdx $i,$t4,$i
addi $i,$i,16 addi $i,$i,16
bdnz- Lcopy bdnz- Lcopy
___
$code.=<<___ if ($SIZE_T==4);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
addi $tp,$sp,`$FRAME+$TRANSFER`
addi $np,$np,-4
addi $rp,$rp,-4
addi $ap,$sp,`$FRAME+$TRANSFER+4`
mtctr $j
.align 4
Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
ldu $t2,16($tp)
lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
lwz $t5,8($np)
lwz $t6,12($np)
lwzu $t7,16($np)
extrdi $t1,$t0,32,0
extrdi $t3,$t2,32,0
subfe $t4,$t4,$t0 ; tp[j]-np[j]
stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
stw $t1,8($ap)
subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
stw $t2,12($ap)
subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
stwu $t3,16($ap)
stw $t4,4($rp)
stw $t5,8($rp)
stw $t6,12($rp)
stwu $t7,16($rp)
bdnz- Lsub
li $i,0
subfe $ovf,$i,$ovf ; handle upmost overflow bit
addi $tp,$sp,`$FRAME+$TRANSFER+4`
subf $rp,$num,$rp ; rewind rp
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
addi $tp,$sp,`$FRAME+$TRANSFER`
mtctr $j
.align 4
Lcopy: ; copy or in-place refresh
lwz $t0,4($ap)
lwz $t1,8($ap)
lwz $t2,12($ap)
lwzu $t3,16($ap)
std $i,8($nap_d) ; zap nap_d
std $i,16($nap_d)
std $i,24($nap_d)
std $i,32($nap_d)
std $i,40($nap_d)
std $i,48($nap_d)
std $i,56($nap_d)
stdu $i,64($nap_d)
stw $t0,4($rp)
stw $t1,8($rp)
stw $t2,12($rp)
stwu $t3,16($rp)
std $i,8($tp) ; zap tp at once
stdu $i,16($tp)
bdnz- Lcopy
___
$code.=<<___;
$POP r14,`2*$SIZE_T`($sp) $POP r14,`2*$SIZE_T`($sp)
$POP r15,`3*$SIZE_T`($sp) $POP r15,`3*$SIZE_T`($sp)
$POP r16,`4*$SIZE_T`($sp) $POP r16,`4*$SIZE_T`($sp)
......
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <signal.h>
#include <openssl/bn.h>
#define PPC_FPU64 (1<<0)
static int OPENSSL_ppccap_P = 0;
static sigset_t all_masked;
/*
 * Montgomery multiplication entry point: routes the call either to the
 * FPU-based 64-bit routine (bn_mul_mont_fpu64) or to the integer-only one
 * (bn_mul_mont_int), depending on the PPC_FPU64 capability bit and on the
 * pointer width of the build.
 *
 * All six arguments are handed through untouched to whichever routine is
 * selected, and that routine's return value is returned as-is.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
    const int have_fpu64 = (OPENSSL_ppccap_P & PPC_FPU64) != 0;

    /* Without the capability bit every configuration uses the integer code. */
    if (!have_fpu64)
        return bn_mul_mont_int(rp, ap, bp, np, n0, num);

    /* this is a "must" on Power 6, but run-time detection
     * is not implemented yet... */
    if (sizeof(size_t) != 4)
        return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);

#if (defined(__APPLE__) && defined(__MACH__))
    /* 32-bit Mac OS X build: the FPU routine is called directly, with no
     * signal masking around it. */
    return bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
#else
    /* boundary of 32 was experimentally determined on
       Linux 2.6.22, might have to be adjusted on AIX... */
    if (num < 32)
        return bn_mul_mont_int(rp, ap, bp, np, n0, num);

    {
        sigset_t saved_mask;
        int rc;

        /* Run the FPU routine with the all_masked signal set installed,
         * then put the caller's mask back. */
        sigprocmask(SIG_SETMASK, &all_masked, &saved_mask);
        rc = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
        sigprocmask(SIG_SETMASK, &saved_mask, NULL);
        return rc;
    }
#endif
}
static sigjmp_buf ill_jmp;
/*
 * Signal handler installed for SIGILL while probing the CPU: jumps back to
 * the sigsetjmp() point recorded in ill_jmp, delivering the signal number
 * as sigsetjmp()'s return value.
 */
static void ill_handler(int sig)
{
    siglongjmp(ill_jmp, sig);
}
void OPENSSL_cpuid_setup(void)
{
char *e;
sigfillset(&all_masked);
sigdelset(&all_masked,SIGSEGV);
sigdelset(&all_masked,SIGILL);
if ((e=getenv("OPENSSL_ppccap")))
{
OPENSSL_ppccap_P=strtoul(e,NULL,0);
return;
}
if (sizeof(size_t)==4)
{
struct sigaction ill_oact,ill_act;
sigset_t oset;
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
sigfillset(&ill_act.sa_mask);
sigdelset(&ill_act.sa_mask,SIGILL);
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
sigaction (SIGILL,&ill_act,&ill_oact);
if (sigsetjmp(ill_jmp,0) == 0)
{
OPENSSL_ppc64_probe();
OPENSSL_ppccap_P |= PPC_FPU64;
}
else
{
OPENSSL_ppccap_P &= ~PPC_FPU64;
}
sigaction (SIGILL,&ill_oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
}
}
...@@ -23,9 +23,11 @@ $code=<<___; ...@@ -23,9 +23,11 @@ $code=<<___;
.machine "any" .machine "any"
.text .text
.globl .OPENSSL_cpuid_setup .globl .OPENSSL_ppc64_probe
.align 4 .align 4
.OPENSSL_cpuid_setup: .OPENSSL_ppc64_probe:
fcfid f1,f1
extrdi r0,r0,32,0
blr blr
.globl .OPENSSL_wipe_cpu .globl .OPENSSL_wipe_cpu
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册