提交 866e505e 编写于 作者: A Andy Polyakov

sha/asm/sha512-armv8.pl: add NEON version of SHA256.

This provides up to 30% better performance on some of the more recent processors.
Reviewed-by: Richard Levitte <levitte@openssl.org>
上级 79dfc3dd
...@@ -37,6 +37,20 @@ ...@@ -37,6 +37,20 @@
# indication of some compiler "pathology", most notably code # indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster # generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%. # and the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it makes no sense to implement a NEON
# version of SHA256 for 64-bit processors. This is because the
# performance improvement on the most wide-spread Cortex-A5x processors
# was observed to be marginal: same on Cortex-A53 and ~10% on A57. But
# then it was observed that 32-bit NEON SHA256 performs significantly
# better than the 64-bit scalar version on *some* of the more recent
# processors. As a result, a 64-bit NEON version of SHA256 was added to
# provide the best all-round performance. For example, it executes ~30%
# faster on X-Gene and Mongoose. [For reference, a NEON version of SHA512
# is bound to deliver much less improvement, likely *negative* on
# Cortex-A5x, which is why NEON support is limited to SHA256.]
$output=pop; $output=pop;
$flavour=pop; $flavour=pop;
...@@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4); ...@@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4);
ldr w16,[x16] ldr w16,[x16]
tst w16,#ARMV8_SHA256 tst w16,#ARMV8_SHA256
b.ne .Lv8_entry b.ne .Lv8_entry
tst w16,#ARMV7_NEON
b.ne .Lneon_entry
#endif #endif
___ ___
$code.=<<___; $code.=<<___;
...@@ -425,6 +441,296 @@ $code.=<<___; ...@@ -425,6 +441,296 @@ $code.=<<___;
___ ___
} }
if ($SZ==4) { ######################################### NEON stuff #
# You'll surely note a lot of similarities with sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as
# initial template, but was adapted for ARMv8 instruction set and
# extensively re-tuned for all-round performance.
#
# Everything in this section is generated only for the 32-bit word
# variant ($SZ==4), i.e. SHA256.

# Scalar working variables A..H live in w3..w10. @V is the same list and
# is rotated by body_00_15 after every round.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
# Scalar temporaries w11..w15 used by the round body.
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
# Pointer that walks the round-constant table (loaded from .LK256 below).
my $Ktbl="x16";
# Pointer into the on-stack transfer area holding X[i]+K[i] words.
my $Xfer="x17";
# Four vector registers holding the 16-word message schedule window.
my @X = map("q$_",(0..3));
# Vector temporaries.
# NOTE(review): q8..q15 are skipped, presumably because the low halves of
# v8-v15 are callee-saved under AAPCS64 -- confirm.
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
# Generation-time round counter, incremented by body_00_15.
my $j=0;
# Catch-all thunk in the (simplified) x86-style perlasm tradition: any call
# to an undefined sub such as &add_32(...) is turned into one line of
# assembly appended to $code. The first underscore of the sub name becomes
# a dot (add_32 -> add.32), and a purely numeric final operand is emitted
# as an immediate by prefixing it with '#'.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;    # drop the package qualifier
    $mnemonic =~ s/_/\./;                       # first '_' only
    my $last = pop;
    $last = "#$last" if ($last*1 eq $last);     # numeric => immediate
    $code .= "\t" . $mnemonic . "\t" . join(',', @_, $last) . "\n";
}
# Map a vector register name ("qN" or "vN...") to its 64-bit scalar view
# "dN". Returns the empty string when the argument names no q/v register.
sub Dscalar {
    my ($reg) = @_;
    return $reg =~ /[qv]([0-9]+)/ ? "d$1" : "";
}
# Map a vector register name ("qN" or "vN...") to its low 64-bit lane,
# "vN.d[0]". Returns the empty string when the argument names no q/v register.
sub Dlo {
    my ($reg) = @_;
    return $reg =~ /[qv]([0-9]+)/ ? "v$1.d[0]" : "";
}
# Map a vector register name ("qN" or "vN...") to its high 64-bit lane,
# "vN.d[1]". Returns the empty string when the argument names no q/v register.
sub Dhi {
    my ($reg) = @_;
    return $reg =~ /[qv]([0-9]+)/ ? "v$1.d[1]" : "";
}
# Emit one message-schedule update step interleaved with four scalar rounds.
#
# Argument: a code reference (body_00_15) that returns the list of Perl
# snippets making up one scalar round. It is invoked four times, and the
# resulting snippets are eval'ed one or a few at a time between the NEON
# instructions so that vector and scalar work are mixed in the output.
#
# NEON side: computes the next four schedule words into @X[0], loads the
# next four round constants through $Ktbl, adds them to the new words and
# stores the sums at $Xfer. Finally @X is rotated so that the freshly
# updated register becomes the last element.
#
# The exact position of every eval() relative to the NEON instructions
# determines the emitted instruction order; do not reorder.
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned by the eval'ed round snippets ('($a,...,$h)=@V;').
my ($a,$b,$c,$d,$e,$f,$g,$h);
&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
eval(shift(@insns));
eval(shift(@insns));
# A ushr by n paired with an sli by 32-n on the same source builds a
# rotate-right by n; a lone ushr is a plain logical shift.
&ushr_32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
&ushr_32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
eval(shift(@insns));
&sli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T4,$T7,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T4,$T7,32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T5,$T7,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T7,$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
# NOTE(review): the only sli spelled with a ".u32" suffix; the output
# post-processing strips /\.\w?32\b/, so this presumably assembles exactly
# like sli_32 -- confirm.
&sli_u32 ($T3,$T7,32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
# Second sigma1 pass, now over the just-updated @X[0].
&ushr_32 ($T6,@X[0],$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T7,@X[0],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T6,@X[0],32-$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T5,@X[0],$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T7,$T7,$T6);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T5,@X[0],32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
# Fetch the next four round constants; $Ktbl post-increments by 16 bytes.
&ld1_32 ("{$T0}","[$Ktbl], #16");
eval(shift(@insns));
&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
# Zero $T5, then place the low half of $T7 into its high half so the add
# below only affects the upper two words of @X[0].
&eor_8 ($T5,$T5,$T5);
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dhi($T5), &Dlo($T7));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
# Drain all but the last remaining scalar snippet, store X+K for later
# rounds, then run the final snippet.
while($#insns>=1) { eval(shift(@insns)); }
&st1_32 ("{$T0}","[$Xfer], #16");
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
# Emit four scalar rounds interleaved with loading the next 16 input bytes.
#
# Argument: a code reference (body_00_15) returning the snippets of one
# scalar round; it is invoked four times. Between the eval'ed snippets this
# sub emits NEON code that loads 16 bytes from $inp into @X[0], loads four
# round constants through $Ktbl, byte-swaps each 32-bit word (rev32), adds
# the constants and stores the sums at $Xfer. @X is rotated at the end.
#
# Unlike Xupdate, no schedule expansion is done here.
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned by the eval'ed round snippets ('($a,...,$h)=@V;').
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
&ld1_8 ("{@X[0]}","[$inp],#16");
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl],#16");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&rev32 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&st1_32 ("{$T0}","[$Xfer], #16");
push(@X,shift(@X)); # "rotate" X[]
}
# Return the scalar code for one round as a list of Perl snippets.
#
# Each list element is a string that the caller (Xupdate/Xpreload) passes
# to eval(); most elements emit exactly one instruction through AUTOLOAD.
# Elements joined with '.' are evaluated together as one unit. The snippets
# reference $a..$h, which are lexicals declared in the caller and bound to
# the current rotation of @V by the first snippet.
#
# Parts of the previous round's result (Sigma0(a) in $t4 and Maj(a,b,c) in
# $t2) are added in at the start of the next round -- "from the past".
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
'&and ($t1,$f,$e)',
'&bic ($t4,$g,$e)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ror ($t0,$t0,"#$Sigma1[0]")',
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t0)', # h+=Sigma1(e)
# Preload the next round's X[i]+K[i] from the stack. Only when $j is
# exactly 15 is the word at $Ktbl loaded instead; the loop tail compares
# it with zero to detect the K256 terminator. When ($j&15)==15 and $j!=15
# no load is emitted at all.
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&ror ($t4,$t4,"#$Sigma0[0]")',
'&add ($d,$d,$h)', # d+=h
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
# Generation-time bookkeeping only (emits nothing): advance the round
# counter, rotate the working-variable names, swap the roles of $t2/$t3.
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
# Prologue of sha256_block_neon (also reachable as .Lneon_entry; the symbol
# is made global only under __KERNEL__). The emitted code:
#   - pushes x29/x30 and reserves 16*4 bytes of stack as the X+K transfer area;
#   - points $Ktbl at .LK256 and turns $num into an end-of-input pointer;
#   - loads the first 64 input bytes into @X[0..3] and byte-swaps each word;
#   - loads the first 16 round constants, adds them to @X and stores the
#     sums on the stack, leaving $Xfer pointing at sp+32;
#   - loads the eight state words from $ctx into $A..$H;
#   - primes the round temporaries ($t1 = first X+K word, $t2 = $t4 = 0,
#     $t3 = B^C) and falls into the .L_00_48 loop.
# $inp, $num and $ctx are defined outside this section.
# The heredoc body is emitted verbatim (after interpolation), so it is left
# untouched here.
$code.=<<___;
#ifdef __KERNEL__
.globl sha256_block_neon
#endif
.type sha256_block_neon,%function
.align 4
sha256_block_neon:
.Lneon_entry:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4
adr $Ktbl,.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp
ld1.8 {@X[0]},[$inp], #16
ld1.8 {@X[1]},[$inp], #16
ld1.8 {@X[2]},[$inp], #16
ld1.8 {@X[3]},[$inp], #16
ld1.32 {$T0},[$Ktbl], #16
ld1.32 {$T1},[$Ktbl], #16
ld1.32 {$T2},[$Ktbl], #16
ld1.32 {$T3},[$Ktbl], #16
rev32 @X[0],@X[0] // yes, even on
rev32 @X[1],@X[1] // big-endian
rev32 @X[2],@X[2]
rev32 @X[3],@X[3]
mov $Xfer,sp
add.32 $T0,$T0,@X[0]
add.32 $T1,$T1,@X[1]
add.32 $T2,$T2,@X[2]
st1.32 {$T0-$T1},[$Xfer], #32
add.32 $T3,$T3,@X[3]
st1.32 {$T2-$T3},[$Xfer]
sub $Xfer,$Xfer,#32
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldp $E,$F,[$ctx,#16]
ldp $G,$H,[$ctx,#24]
ldr $t1,[sp,#0]
mov $t2,wzr
eor $t3,$B,$C
mov $t4,wzr
b .L_00_48
.align 4
.L_00_48:
___
# Body of the .L_00_48 loop: four Xupdate steps, each covering four scalar
# rounds plus one schedule-register update, i.e. 16 rounds per iteration.
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
# Loop tail. $t1 holds the word read from [$Ktbl] in round 15; zero marks
# the end of the K256 table, otherwise the loop repeats with $Xfer rewound
# by 64 bytes. After the loop $Ktbl is rewound by 256 bytes, and if $inp
# has reached the end pointer $num it is backed up by 64 bytes so that the
# preload below does not read past the input ("avoid SEGV").
# NOTE(review): the flags set by 'cmp $inp,$num' appear to be consumed by
# the 'b.ne .L_00_48' in the epilogue; nothing emitted in between looks
# flag-setting -- confirm before adding instructions here.
$code.=<<___;
cmp $t1,#0 // check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
cmp $inp,$num
mov $Xfer, #64
csel $Xfer, $Xfer, xzr, eq
sub $inp,$inp,$Xfer // avoid SEGV
mov $Xfer,sp
___
# Another 16 rounds using the X+K words already on the stack, interleaved
# with loading, byte-swapping and K-adding the next 64 input bytes.
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
# Epilogue of one block. The emitted code:
#   - folds the deferred Sigma0(a) and Maj(a,b,c) terms into $A;
#   - adds the previous state words from $ctx to $A..$H and stores the
#     result back to $ctx;
#   - re-primes $t1/$t2/$t3/$t4 and $Xfer exactly as the prologue does;
#   - branches back to .L_00_48 while more input remains (b.ne; the flags
#     presumably come from the earlier 'cmp $inp,$num' -- confirm);
#   - otherwise reloads x29 from the saved frame, releases the 16*4-byte
#     transfer area plus the 16-byte frame, and returns.
# The closing brace after the heredoc ends the 'if ($SZ==4)' NEON section.
$code.=<<___;
add $A,$A,$t4 // h+=Sigma0(a) from the past
ldp $t0,$t1,[$ctx,#0]
add $A,$A,$t2 // h+=Maj(a,b,c) from the past
ldp $t2,$t3,[$ctx,#8]
add $A,$A,$t0 // accumulate
add $B,$B,$t1
ldp $t0,$t1,[$ctx,#16]
add $C,$C,$t2
add $D,$D,$t3
ldp $t2,$t3,[$ctx,#24]
add $E,$E,$t0
add $F,$F,$t1
ldr $t1,[sp,#0]
stp $A,$B,[$ctx,#0]
add $G,$G,$t2
mov $t2,wzr
stp $C,$D,[$ctx,#8]
add $H,$H,$t3
stp $E,$F,[$ctx,#16]
eor $t3,$B,$C
stp $G,$H,[$ctx,#24]
mov $t4,wzr
mov $Xfer,sp
b.ne .L_00_48
ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size sha256_block_neon,.-sha256_block_neon
___
}
$code.=<<___; $code.=<<___;
#ifndef __KERNEL__ #ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4 .comm OPENSSL_armcap_P,4,4
...@@ -456,12 +762,15 @@ close SELF; ...@@ -456,12 +762,15 @@ close SELF;
foreach(split("\n",$code)) { foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo; s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.\w?32\b//o and s/\.16b/\.4s/go; s/\.[ui]?8(\s)/$1/;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; s/\.\w?32\b// and s/\.16b/\.4s/g;
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n"; print $_,"\n";
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册