提交 67150340 编写于 作者: A Andy Polyakov

PPC assembler pack: adhere closer to ABI specs, add PowerOpen traceback data.

上级 0ca9a483
......@@ -18,7 +18,7 @@
# February 2010
#
# Rescheduling instructions to favour Power6 pipeline gives 10%
# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platfrom in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process byte in 18 cycles, only in 23, because it fails to issue
......@@ -33,11 +33,13 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
......@@ -116,15 +118,19 @@ LAES_Te:
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
.space `32-24`
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8-32+2048+256`
addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
.space `128-32-24`
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
......@@ -328,10 +334,9 @@ $code.=<<___;
.globl .AES_encrypt
.align 7
.AES_encrypt:
mflr r0
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
......@@ -352,6 +357,7 @@ $code.=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $s0,0($inp)
lwz $s1,4($inp)
......@@ -364,7 +370,7 @@ $code.=<<___;
stw $s2,8($out)
stw $s3,12($out)
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
......@@ -388,6 +394,9 @@ $code.=<<___;
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 5
Lppc_AES_encrypt:
......@@ -530,6 +539,8 @@ Lenc_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
......@@ -673,14 +684,15 @@ Lenc_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
mflr r0
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
......@@ -701,6 +713,7 @@ Lenc_compact_done:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $s0,0($inp)
lwz $s1,4($inp)
......@@ -713,7 +726,7 @@ Lenc_compact_done:
stw $s2,8($out)
stw $s3,12($out)
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
......@@ -737,6 +750,9 @@ Lenc_compact_done:
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 5
Lppc_AES_decrypt:
......@@ -879,6 +895,8 @@ Ldec_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
......@@ -1179,7 +1197,9 @@ Ldec_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.long 0
.byte 0,12,0x14,0,0,0,0,0
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
......
......@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
......@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
$FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
......@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
......@@ -89,18 +90,18 @@ $aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r14";
$j="r15";
$tp="r16";
$m0="r17";
$m1="r18";
$lo0="r19";
$hi0="r20";
$lo1="r21";
$hi1="r22";
$alo="r23";
$ahi="r24";
$nlo="r25";
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
......@@ -123,32 +124,33 @@ ___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE`
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r14,`4*$SIZE_T`($sp)
$PUSH r15,`5*$SIZE_T`($sp)
$PUSH r16,`6*$SIZE_T`($sp)
$PUSH r17,`7*$SIZE_T`($sp)
$PUSH r18,`8*$SIZE_T`($sp)
$PUSH r19,`9*$SIZE_T`($sp)
$PUSH r20,`10*$SIZE_T`($sp)
$PUSH r21,`11*$SIZE_T`($sp)
$PUSH r22,`12*$SIZE_T`($sp)
$PUSH r23,`13*$SIZE_T`($sp)
$PUSH r24,`14*$SIZE_T`($sp)
$PUSH r25,`15*$SIZE_T`($sp)
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
......@@ -210,8 +212,8 @@ L1st:
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
$LD $tj,$FRAME($sp) ; tp[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
......@@ -278,7 +280,7 @@ Linner:
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
......@@ -304,23 +306,27 @@ Lcopy: ; copy or in-place refresh
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
$POP r17,`7*$SIZE_T`($sp)
$POP r18,`8*$SIZE_T`($sp)
$POP r19,`9*$SIZE_T`($sp)
$POP r20,`10*$SIZE_T`($sp)
$POP r21,`11*$SIZE_T`($sp)
$POP r22,`12*$SIZE_T`($sp)
$POP r23,`13*$SIZE_T`($sp)
$POP r24,`14*$SIZE_T`($sp)
$POP r25,`15*$SIZE_T`($sp)
$POP $sp,0($sp)
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
.byte 0,12,4,0,0x80,12,6,0
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
......
......@@ -389,7 +389,9 @@ $data=<<EOF;
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -814,8 +816,9 @@ $data=<<EOF;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -966,7 +969,9 @@ $data=<<EOF;
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1502,7 +1507,9 @@ $data=<<EOF;
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1707,7 +1717,9 @@ Lppcasm_div8:
Lppcasm_div9:
or r3,r8,r0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
......@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
......
......@@ -70,7 +70,6 @@ $flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
$FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
......@@ -79,7 +78,6 @@ if ($flavour =~ /32/) {
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
$FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
......@@ -95,7 +93,7 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=($FRAME+63)&~63;
$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
......@@ -112,16 +110,16 @@ $tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$nap_d="r14"; # interleaved ap and np in double format
$a0="r15"; # ap[0]
$t0="r16"; # temporary registers
$t1="r17";
$t2="r18";
$t3="r19";
$t4="r20";
$t5="r21";
$t6="r22";
$t7="r23";
$nap_d="r22"; # interleaved ap and np in double format
$a0="r23"; # ap[0]
$t0="r24"; # temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
......@@ -151,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
$T0a="f18"; $T0b="f19";
$T1a="f20"; $T1b="f21";
$T2a="f22"; $T2b="f23";
$T3a="f24"; $T3b="f25";
$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
$T0a="f24"; $T0b="f25";
$T1a="f26"; $T1b="f27";
$T2a="f28"; $T2b="f29";
$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
# | |
# +-------------------------------+
# | 10 saved gpr, r14-r23 |
# . .
# . .
# +12*size_t +-------------------------------+
# | 12 saved fpr, f14-f25 |
# . .
# . .
# +12*8 +-------------------------------+
# | padding to 64 byte boundary |
# . .
# +X +-------------------------------+
# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
......@@ -192,6 +179,16 @@ $T3a="f24"; $T3b="f25";
# . .
# . .
# +-------------------------------+
# . .
# -12*size_t +-------------------------------+
# | 10 saved gpr, r22-r31 |
# . .
# . .
# -12*8 +-------------------------------+
# | 12 saved fpr, f20-f31 |
# . .
# . .
# +-------------------------------+
$code=<<___;
.machine "any"
......@@ -215,30 +212,31 @@ $code=<<___;
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
$PUSH r14,`2*$SIZE_T`($sp)
$PUSH r15,`3*$SIZE_T`($sp)
$PUSH r16,`4*$SIZE_T`($sp)
$PUSH r17,`5*$SIZE_T`($sp)
$PUSH r18,`6*$SIZE_T`($sp)
$PUSH r19,`7*$SIZE_T`($sp)
$PUSH r20,`8*$SIZE_T`($sp)
$PUSH r21,`9*$SIZE_T`($sp)
$PUSH r22,`10*$SIZE_T`($sp)
$PUSH r23,`11*$SIZE_T`($sp)
stfd f14,`12*$SIZE_T+0`($sp)
stfd f15,`12*$SIZE_T+8`($sp)
stfd f16,`12*$SIZE_T+16`($sp)
stfd f17,`12*$SIZE_T+24`($sp)
stfd f18,`12*$SIZE_T+32`($sp)
stfd f19,`12*$SIZE_T+40`($sp)
stfd f20,`12*$SIZE_T+48`($sp)
stfd f21,`12*$SIZE_T+56`($sp)
stfd f22,`12*$SIZE_T+64`($sp)
stfd f23,`12*$SIZE_T+72`($sp)
stfd f24,`12*$SIZE_T+80`($sp)
stfd f25,`12*$SIZE_T+88`($sp)
$PUSH r22,`-12*8-10*$SIZE_T`($i)
$PUSH r23,`-12*8-9*$SIZE_T`($i)
$PUSH r24,`-12*8-8*$SIZE_T`($i)
$PUSH r25,`-12*8-7*$SIZE_T`($i)
$PUSH r26,`-12*8-6*$SIZE_T`($i)
$PUSH r27,`-12*8-5*$SIZE_T`($i)
$PUSH r28,`-12*8-4*$SIZE_T`($i)
$PUSH r29,`-12*8-3*$SIZE_T`($i)
$PUSH r30,`-12*8-2*$SIZE_T`($i)
$PUSH r31,`-12*8-1*$SIZE_T`($i)
stfd f20,`-12*8`($i)
stfd f21,`-11*8`($i)
stfd f22,`-10*8`($i)
stfd f23,`-9*8`($i)
stfd f24,`-8*8`($i)
stfd f25,`-7*8`($i)
stfd f26,`-6*8`($i)
stfd f27,`-5*8`($i)
stfd f28,`-4*8`($i)
stfd f29,`-3*8`($i)
stfd f30,`-2*8`($i)
stfd f31,`-1*8`($i)
___
$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
......@@ -1052,33 +1050,37 @@ Lcopy: ; copy or in-place refresh
___
$code.=<<___;
$POP r14,`2*$SIZE_T`($sp)
$POP r15,`3*$SIZE_T`($sp)
$POP r16,`4*$SIZE_T`($sp)
$POP r17,`5*$SIZE_T`($sp)
$POP r18,`6*$SIZE_T`($sp)
$POP r19,`7*$SIZE_T`($sp)
$POP r20,`8*$SIZE_T`($sp)
$POP r21,`9*$SIZE_T`($sp)
$POP r22,`10*$SIZE_T`($sp)
$POP r23,`11*$SIZE_T`($sp)
lfd f14,`12*$SIZE_T+0`($sp)
lfd f15,`12*$SIZE_T+8`($sp)
lfd f16,`12*$SIZE_T+16`($sp)
lfd f17,`12*$SIZE_T+24`($sp)
lfd f18,`12*$SIZE_T+32`($sp)
lfd f19,`12*$SIZE_T+40`($sp)
lfd f20,`12*$SIZE_T+48`($sp)
lfd f21,`12*$SIZE_T+56`($sp)
lfd f22,`12*$SIZE_T+64`($sp)
lfd f23,`12*$SIZE_T+72`($sp)
lfd f24,`12*$SIZE_T+80`($sp)
lfd f25,`12*$SIZE_T+88`($sp)
$POP $sp,0($sp)
$POP $i,0($sp)
li r3,1 ; signal "handled"
$POP r22,`-12*8-10*$SIZE_T`($i)
$POP r23,`-12*8-9*$SIZE_T`($i)
$POP r24,`-12*8-8*$SIZE_T`($i)
$POP r25,`-12*8-7*$SIZE_T`($i)
$POP r26,`-12*8-6*$SIZE_T`($i)
$POP r27,`-12*8-5*$SIZE_T`($i)
$POP r28,`-12*8-4*$SIZE_T`($i)
$POP r29,`-12*8-3*$SIZE_T`($i)
$POP r30,`-12*8-2*$SIZE_T`($i)
$POP r31,`-12*8-1*$SIZE_T`($i)
lfd f20,`-12*8`($i)
lfd f21,`-11*8`($i)
lfd f22,`-10*8`($i)
lfd f23,`-9*8`($i)
lfd f24,`-8*8`($i)
lfd f25,`-7*8`($i)
lfd f26,`-6*8`($i)
lfd f27,`-5*8`($i)
lfd f28,`-4*8`($i)
lfd f29,`-3*8`($i)
lfd f30,`-2*8`($i)
lfd f31,`-1*8`($i)
mr $sp,$i
blr
.long 0
.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
.byte 0,12,4,0,0x8c,10,6,0
.long 0
.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
......
......@@ -29,12 +29,16 @@ $code=<<___;
fcfid f1,f1
extrdi r0,r0,32,0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_altivec_probe
.align 4
.OPENSSL_altivec_probe:
.long 0x10000484 # vor v0,v0,v0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_wipe_cpu
.align 4
......@@ -65,6 +69,8 @@ $code=<<___;
fmr f12,f31
fmr f13,f31
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_atomic_add
.align 4
......@@ -75,6 +81,9 @@ Ladd: lwarx r5,0,r3
bne- Ladd
$SIGNX r3,r0
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.globl .OPENSSL_rdtsc
.align 4
......@@ -82,6 +91,8 @@ Ladd: lwarx r5,0,r3
mftb r3
mftbu r4
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_cleanse
.align 4
......@@ -111,6 +122,9 @@ Laligned:
andi. r4,r4,3
bne Little
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
___
{
my ($out,$cnt,$max)=("r3","r4","r5");
......@@ -145,6 +159,9 @@ Loop: mftb $tick
mr r3,$cnt
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.globl .OPENSSL_instrument_bus2
.align 4
......@@ -193,6 +210,9 @@ Ldone2:
srwi $cnt,$cnt,2
sub r3,r0,$cnt
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
___
}
......
......@@ -24,12 +24,14 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
......@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T;
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
......@@ -162,9 +165,8 @@ $code=<<___;
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$STU $sp,`-($FRAME+64)`($sp)
$PUSH r0,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
......@@ -182,6 +184,7 @@ $code=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
......@@ -192,37 +195,14 @@ $code=<<___;
Laligned:
mtctr $num
bl Lsha1_block_private
Ldone:
$POP r0,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,`$FRAME+64`
blr
___
b Ldone
# PowerPC specification allows an implementation to be ill-behaved
# upon unaligned access which crosses page boundary. "Better safe
# than sorry" principle makes me treat it specially. But I don't
# look for particular offending word, but rather for 64-byte input
# block which crosses the boundary. Once found that block is aligned
# and hashed separately...
$code.=<<___;
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
......@@ -237,7 +217,7 @@ Lunaligned:
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$FRAME ; spot below the frame
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
......@@ -251,15 +231,40 @@ Lmemcpy:
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$FRAME
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*19`($sp)
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
b Ldone
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
# This is private block function, which uses tailored calling
......@@ -309,6 +314,8 @@ $code.=<<___;
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
......
......@@ -40,6 +40,7 @@ $output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
......@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
......@@ -87,7 +89,8 @@ if ($output =~ /512/) {
$SHR="srwi";
}
$FRAME=32*$SIZE_T;
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
......@@ -179,13 +182,12 @@ $code=<<___;
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr r0
$STU $sp,`-($FRAME+16*$SZ)`($sp)
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
......@@ -206,6 +208,7 @@ $func:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
......@@ -217,7 +220,7 @@ $func:
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
b LPICmeup
bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
......@@ -226,40 +229,14 @@ Laligned:
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
Ldone:
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,`$FRAME+16*$SZ`
blr
___
b Ldone
# PowerPC specification allows an implementation to be ill-behaved
# upon unaligned access which crosses page boundary. "Better safe
# than sorry" principle makes me treat it specially. But I don't
# look for particular offending word, but rather for the input
# block which crosses the boundary. Once found that block is aligned
# and hashed separately...
$code.=<<___;
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for the input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
......@@ -278,7 +255,7 @@ Lunaligned:
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
addi r20,$sp,$FRAME ; aligned spot below the frame
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
......@@ -293,8 +270,8 @@ Lmemcpy:
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$FRAME ; fictitious inp pointer
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
......@@ -303,10 +280,36 @@ Lmemcpy:
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
b Ldone
___
$code.=<<___;
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 4
Lsha2_block_private:
___
......@@ -372,6 +375,8 @@ $code.=<<___;
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
# Ugly hack here, because PPC assembler syntax seem to vary too
......@@ -379,22 +384,15 @@ ___
$code.=<<___;
.align 6
LPICmeup:
bl LPIC
addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
b LPICedup
nop
nop
nop
nop
nop
LPIC: mflr $Tbl
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
nop
nop
nop
nop
nop
nop
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册