提交 1c3d2b94 编写于 作者: A Andy Polyakov

This is "informational" commit. Its mere purpose is to expose "modulo

factor" in inner loops.
上级 48d2335d
......@@ -286,19 +286,16 @@ $fname:
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
ba .L1st
add $j,8,$j
.align 32
.L1st:
add $ap,$j,%o3
add $np,$j,%o4
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o4+0],$nlo_ ! load n[j] as pair of 32-bit words
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o4+4],$nhi_
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
......@@ -350,6 +347,11 @@ $fname:
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bz,pn %icc,.L1stskip
.align 32,0x1000000
.L1st:
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
......@@ -376,9 +378,101 @@ $fname:
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$ba,$aloa
std $ahi,[$ap_h+$j]
fmuld $nlo,$na,$nloa
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bb,$alob
std $nhi,[$np_h+$j]
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
add %o7,%o1,%o1
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
addcc %g1,%o0,%o0
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
fdtox $dota,$dota
fdtox $dotb,$dotb
......@@ -514,10 +608,7 @@ $fname:
bcs,a %xcc,.+8
add %g1,1,%g1
ba .Linner
add $j,8,$j
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
......@@ -563,6 +654,11 @@ $fname:
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bz,pn %icc,.Linnerskip
.align 32,0x1000000
.Linner:
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
......@@ -594,9 +690,91 @@ $fname:
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
add %o7,%o1,%o1
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
addcc %g1,%o0,%o0
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
ldx [$tp+8],%o7 ! tp[j]
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
fdtox $dota,$dota
fdtox $dotb,$dotb
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册