提交 8eed3289 编写于 作者: Andy Polyakov

bn/asm/armv4-mont.pl: boost NEON performance.

Close difference gap on Cortex-A9, which resulted in further improvement
even on other processors.
Reviewed-by: Richard Levitte <levitte@openssl.org>
上级 75f648aa
...@@ -38,6 +38,15 @@ ...@@ -38,6 +38,15 @@
# for execution on all NEON-capable processors, because gain on # for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9. # others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
$flavour = shift; $flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
...@@ -272,19 +281,16 @@ bn_mul_mont: ...@@ -272,19 +281,16 @@ bn_mul_mont:
.size bn_mul_mont,.-bn_mul_mont .size bn_mul_mont,.-bn_mul_mont
___ ___
{ {
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5"); my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31)); my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z); my $zero="$Z#lo";
my $temp=&Dlo($Temp); my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___; $code.=<<___;
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
...@@ -300,59 +306,58 @@ bn_mul8x_mont_neon: ...@@ -300,59 +306,58 @@ bn_mul8x_mont_neon:
ldmia ip,{r4-r5} @ load rest of parameter block ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp mov ip,sp
sub $toutptr,sp,#16 cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]! vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4 veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64 and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32] vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0] vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1] vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0] vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16 vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 $A3xB,$Bi,${A1}[1] vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")` vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0 vmul.u32 $Ni,$Ni,$M0
vmull.u32 $A4xB,$Bi,${A2}[0] vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]! vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1] vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0] vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1] vmull.u32 @ACC[7],$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank... vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1 sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1] vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0] vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1] vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0] vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,$A0xB vmov $Temp,@ACC[0]
vmlal.u32 $A5xB,$Ni,${N2}[1] vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov $A0xB,$A1xB vmov @ACC[0],@ACC[1]
vmlal.u32 $A6xB,$Ni,${N3}[0] vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov $A1xB,$A2xB vmov @ACC[1],@ACC[2]
vmlal.u32 $A7xB,$Ni,${N3}[1] vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov $A2xB,$A3xB vmov @ACC[2],@ACC[3]
vmov $A3xB,$A4xB vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16 vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB vmov @ACC[4],@ACC[5]
vmov $A5xB,$A6xB vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,`&Dhi("$Temp")` vadd.u64 $temp,$temp,$Temp#hi
vmov $A6xB,$A7xB vmov @ACC[6],@ACC[7]
veor $A7xB,$A7xB veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16 vshr.u64 $temp,$temp,#16
b .LNEON_outer8 b .LNEON_outer8
...@@ -362,279 +367,302 @@ bn_mul8x_mont_neon: ...@@ -362,279 +367,302 @@ bn_mul8x_mont_neon:
vld1.32 {${Bi}[0]}, [$bptr,:32]! vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero veor $zero,$zero,$zero
vzip.16 $Bi,$zero vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0] vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1] vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0] vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16 vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 $A3xB,$Bi,${A1}[1] vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")` vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero veor $zero,$zero,$zero
subs $outer,$outer,#1 subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0 vmul.u32 $Ni,$Ni,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0] vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1] vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0] vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1] vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0] vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1] vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0] vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1] vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0] vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,$A0xB vmov $Temp,@ACC[0]
vmlal.u32 $A5xB,$Ni,${N2}[1] vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov $A0xB,$A1xB vmov @ACC[0],@ACC[1]
vmlal.u32 $A6xB,$Ni,${N3}[0] vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov $A1xB,$A2xB vmov @ACC[1],@ACC[2]
vmlal.u32 $A7xB,$Ni,${N3}[1] vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov $A2xB,$A3xB vmov @ACC[2],@ACC[3]
vmov $A3xB,$A4xB vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16 vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB vmov @ACC[4],@ACC[5]
vmov $A5xB,$A6xB vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,`&Dhi("$Temp")` vadd.u64 $temp,$temp,$Temp#hi
vmov $A6xB,$A7xB vmov @ACC[6],@ACC[7]
veor $A7xB,$A7xB veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16 vshr.u64 $temp,$temp,#16
bne .LNEON_outer8 bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16 vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#16 add $tinptr,sp,#96
vshr.u64 $temp,`&Dhi("$A0xB")`,#16 vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail2 b .LNEON_tail_entry
.align 4 .align 4
.LNEON_1st: .LNEON_8n:
vmlal.u32 $A0xB,$Ni,${N0}[0] veor @ACC[0],@ACC[0],@ACC[0]
vld1.32 {$A0-$A3}, [$aptr]! sub $toutptr,sp,#128
vmlal.u32 $A1xB,$Ni,${N0}[1] veor @ACC[1],@ACC[1],@ACC[1]
subs $inner,$inner,#8 sub $toutptr,$toutptr,$num,lsl#4
vmlal.u32 $A2xB,$Ni,${N1}[0] veor @ACC[2],@ACC[2],@ACC[2]
vmlal.u32 $A3xB,$Ni,${N1}[1] and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
vmlal.u32 $A4xB,$Ni,${N2}[0] mov sp,$toutptr @ alloca
vld1.32 {$N0-$N1}, [$nptr]! veor @ACC[4],@ACC[4],@ACC[4]
vmlal.u32 $A5xB,$Ni,${N2}[1] add $toutptr,$toutptr,#256
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! veor @ACC[5],@ACC[5],@ACC[5]
vmlal.u32 $A6xB,$Ni,${N3}[0] sub $inner,$num,#8
vmlal.u32 $A7xB,$Ni,${N3}[1] veor @ACC[6],@ACC[6],@ACC[6]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! veor @ACC[7],@ACC[7],@ACC[7]
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer .LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4 .align 4
.LNEON_outer: .LNEON_8n_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]! vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero vzip.16 $Bi,$zero
sub $inner,$num,#8 add $toutptr,sp,#128
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 $A0xB,$Bi,${A0}[0] vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")` vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.64 {$A7xB},[$tinptr,:128]! vshl.i64 $Ni,@ACC[0]#hi,#16
vmul.u32 $Ni,$temp,$M0 vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 $A4xB,$Bi,${A2}[0] vmlal.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]! vmul.u32 $Ni,$Ni,$M0
vmlal.u32 $A5xB,$Bi,${A2}[1] vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0] vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1] vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.LNEON_inner: .align 4
vmlal.u32 $A0xB,$Ni,${N0}[0] .LNEON_8n_inner:
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8 subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0] vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1] vld1.64 {@ACC[7]},[$tinptr,:128]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 $A4xB,$Ni,${N2}[0] vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.64 {$A0xB}, [$tinptr, :128]! vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1] vmlal.u32 @ACC[3],$Bi,${A1}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! it ne
vmlal.u32 $A6xB,$Ni,${N3}[0] addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1] vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 $A0xB,$Bi,${A0}[0] ___
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! for ($i=1; $i<8; $i++) {
vmlal.u32 $A1xB,$Bi,${A0}[1] $code.=<<___;
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 $A2xB,$Bi,${A1}[0] vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 $A3xB,$Bi,${A1}[1] vmlal.u32 @ACC[2],$Ni,${N1}[0]
vld1.32 {$N0-$N3}, [$nptr]! vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 $A4xB,$Bi,${A2}[0] vmlal.u32 @ACC[5],$Ni,${N2}[1]
vld1.64 {$A7xB}, [$tinptr, :128]! vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1] vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0] vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 $A7xB,$Bi,${A3}[1] ___
push(@ACC,shift(@ACC));
bne .LNEON_inner $code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 $A0xB,$Ni,${N0}[0] vld1.64 {@ACC[7]},[$tinptr,:128]
add $tinptr,sp,#16 vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 $A1xB,$Ni,${N0}[1] vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr vmlal.u32 @ACC[2],$Bi,${A1}[0]
vmlal.u32 $A2xB,$Ni,${N1}[0] it ne
vld1.64 {$Temp}, [sp,:128] addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 $A3xB,$Ni,${N1}[1] vmlal.u32 @ACC[3],$Bi,${A1}[1]
subs $outer,$outer,#1 vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0] vmlal.u32 @ACC[6],$Bi,${A3}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 $A5xB,$Ni,${N2}[1] ___
vld1.64 {$A0xB}, [$tinptr, :128]! }
vshr.u64 $temp,$temp,#16 $code.=<<___;
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! it eq
vmlal.u32 $A6xB,$Ni,${N3}[0] subeq $aptr,$aptr,$num,lsl#2 @ rewind
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1] vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! vld1.32 {$A0-$A3},[$aptr]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")` vmlal.u32 @ACC[2],$Ni,${N1}[0]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! add $bnptr,sp,#8 @ rewind
vshr.u64 $temp,$temp,#16 vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
bne .LNEON_outer itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $toutptr,sp
mov $inner,$num mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail: .LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! vshr.u64 $temp,@ACC[0]#lo,#16
vshr.u64 $temp,`&Dlo("$A0xB")`,#16 vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16 vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {$A7xB}, [$tinptr, :128]! vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail2: .LNEON_tail_entry:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp ___
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! for ($i=1; $i<8; $i++) {
vshr.u64 $temp,`&Dlo("$A1xB")`,#16 $code.=<<___;
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16 vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp vshr.u64 $temp,@ACC[1]#hi,#16
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! vzip.16 @ACC[1]#lo,@ACC[1]#hi
vshr.u64 $temp,`&Dlo("$A2xB")`,#16 ___
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp push(@ACC,shift(@ACC));
vshr.u64 $temp,`&Dhi("$A2xB")`,#16 }
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` push(@ACC,shift(@ACC));
$code.=<<___;
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8 subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
...@@ -708,8 +736,14 @@ $code.=<<___; ...@@ -708,8 +736,14 @@ $code.=<<___;
#endif #endif
___ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem; foreach (split("\n",$code)) {
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 s/\`([^\`]*)\`/eval $1/ge;
$code =~ s/\bret\b/bx lr/gm;
print $code; s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; close STDOUT;
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册