“3d068261854b00c930df4516cd617900935e7706”上不存在“Documentation/git@gitcode.net:openanolis/cloud-kernel.git”
提交 c4d9ef4c 编写于 作者: A Andy Polyakov

sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.

The biggest part of the improvement, ~7%, resulted from omitting the
constants' table index increment in each round; a minor part came from
rescheduling instructions. Apparently POWER9 (and POWER8) manage to
dispatch instructions more efficiently if they are laid down as if they
have no latency...
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)
上级 1a467bd1
......@@ -37,8 +37,8 @@
# build of sha512-ppc.pl, presented for reference.
#
# POWER8 POWER9
# SHA256 9.9 [15.8] 12.2 [12.5]
# SHA512 6.3 [10.3] 7.7 [7.9]
# SHA256 9.7 [15.8] 11.2 [12.5]
# SHA512 6.1 [10.3] 7.0 [7.9]
$flavour=shift;
$output =shift;
......@@ -79,7 +79,8 @@ if ($output =~ /512/) {
}
$func="sha${bits}_block_p8";
$FRAME=8*$SIZE_T;
$LOCALS=8*$SIZE_T+8*16;
$FRAME=$LOCALS+9*16+6*$SIZE_T;
$sp ="r1";
$toc="r2";
......@@ -91,16 +92,17 @@ $idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
$x00=0 if ($flavour =~ /osx/);
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
$x00=0 if ($flavour =~ /osx/);
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
@X=map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
my $k=($i+2)%8;
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
......@@ -112,26 +114,30 @@ ___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___ if ($i>=15);
vshasigma${sz} $Sigma,@X[($j+1)%16],0,0
vaddu${sz}m @X[$j],@X[$j],$Sigma
vshasigma${sz} $Sigma,@X[($j+14)%16],0,15
vaddu${sz}m @X[$j],@X[$j],$Sigma
vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]
___
$code.=<<___;
`"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
vsel $Func,$g,$f,$e ; Ch(e,f,g)
vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
`"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
vsel $Func,$g,$f,$e ; Ch(e,f,g)
vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e)
vxor $Func,$a,$b
`"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $d,$d,$h ; d+=h
vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
`"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
lvx $Ki,$idx,$Tbl ; load next K[i]
addi $idx,$idx,16
vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
`"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a)
vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c)
vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c)
lvx $Ki,@I[$k],$idx ; load next K[i]
___
$code.=<<___ if ($k == 7);
addi $idx,$idx,0x80
___
}
......@@ -142,21 +148,13 @@ $code=<<___;
.globl $func
.align 6
$func:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
$STU $sp,-$FRAME($sp)
mflr $lrsave
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
li r10,`$LOCALS+15`
li r11,`$LOCALS+31`
stvx v24,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
......@@ -169,26 +167,26 @@ $func:
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r11,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li r11,-4096+255
stw $vrsave,`$FRAME+6*$SIZE_T-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$PUSH r26,`$FRAME-6*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$PUSH r27,`$FRAME-5*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$PUSH r28,`$FRAME-4*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$PUSH r29,`$FRAME-3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$PUSH r30,`$FRAME-2*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
$PUSH r31,`$FRAME-1*$SIZE_T`($sp)
li $x70,0x70
$PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
$PUSH $lrsave,`$FRAME+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
addi $offload,$sp,$FRAME+15
addi $offload,$sp,`8*$SIZE_T+15`
___
$code.=<<___ if ($LENDIAN);
li $idx,8
......@@ -222,9 +220,9 @@ $code.=<<___;
.align 5
Loop:
lvx $Ki,$x00,$Tbl
li $idx,16
lvx_u @X[0],0,$inp
addi $inp,$inp,16
mr $idx,$Tbl # copy $Tbl
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
......@@ -234,8 +232,7 @@ Loop:
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
lvx $Ki,$idx,$Tbl
addi $idx,$idx,16
lvx $Ki,$x10,$Tbl
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
......@@ -268,10 +265,9 @@ $code.=<<___;
bne Loop
___
$code.=<<___ if ($SZ==4);
lvx @X[0],$idx,$Tbl
addi $idx,$idx,16
lvx @X[0],$x20,$idx
vperm $A,$A,$B,$Ki # pack the answer
lvx @X[1],$idx,$Tbl
lvx @X[1],$x30,$idx
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
......@@ -291,19 +287,11 @@ $code.=<<___ if ($SZ==8);
stvx_u $G,$x30,$ctx
___
$code.=<<___;
li r10,`$FRAME+8*16+15`
li r10,`$LOCALS+15`
mtlr $lrsave
li r11,`$FRAME+8*16+31`
li r11,`$LOCALS+31`
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
lvx v24,r10,$sp # ABI says so
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
......@@ -317,13 +305,13 @@ $code.=<<___;
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
$POP r26,`$FRAME-6*$SIZE_T`($sp)
$POP r27,`$FRAME-5*$SIZE_T`($sp)
$POP r28,`$FRAME-4*$SIZE_T`($sp)
$POP r29,`$FRAME-3*$SIZE_T`($sp)
$POP r30,`$FRAME-2*$SIZE_T`($sp)
$POP r31,`$FRAME-1*$SIZE_T`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册