Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
e52aa2b1
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e52aa2b1
编写于
7月 31, 2020
作者:
L
lixian
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix compile bugs on arm32 and add BiasAdd/Relu/Dw arm64 kernels
上级
b6726e4a
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
1019 addition
and
118 deletion
+1019
-118
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S
...e/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S
+95
-85
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S
...e/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S
+38
-27
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S
.../src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S
+131
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S
.../runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S
+137
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S
...runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S
+146
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S
...ite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S
+132
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S
...te/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S
+140
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S
...ntime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S
+96
-0
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S
...ime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S
+77
-0
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
...re/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
+5
-6
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
...ore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
+11
-0
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
...lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
+11
-0
未找到文件。
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmFp32_8x4.S
浏览文件 @
e52aa2b1
#ifdef __aarch64__
#ifdef __arm__
#ifndef __aarch64__
.
text
.
align
5
...
...
@@ -10,15 +11,16 @@
//
void
IndirectGemmFp32_8x4
(
float
*
output
,
float
*
input
,
float
*
weight
,
float
*
bias
,
//
size_t
kSize
,
size_t
ic4
,
size_t
oc8
,
size_t
offset
,
size_t
mode
,
size_t
writeC4
,
size_t
relu
,
size_t
relu6
)
;
//
r0
:
output
,
r1
:
input
,
r2
:
weight
,
r3
:
bias
,
r4
:
kSize
,
r5
:
ic4
,
r6
:
oc
,
r7
:
offset
//
r8
:
mode
,
r10
:
writeMode
,
x
10
:
relu
,
r10
:
relu6
//
r8
:
mode
,
r10
:
writeMode
,
r
10
:
relu
,
r10
:
relu6
//
mode
=
0
for
general
convolution
,
where
one
conv
unit
is
a
row
//
mode
=
1
for
winograd
/
common
gemm
,
where
the
total
channels
of
one
input
is
a
row
IndirectGemmFp32_8x4
:
.
macro
INIT_BIAS
veor
q10
,
q10
,
q10
cbz
x3
,
InitBias
vld1.32
q10
,
[
x3
]
cmp
r3
,
#
0
beq
InitBias
vld1.32
q10
,
[
r3
]
InitBias
:
vmov
q11
,
q10
vmov
q12
,
q10
...
...
@@ -27,10 +29,11 @@ IndirectGemmFp32_8x4:
vmov
q15
,
q10
.
endm
//
registers
v8
~
v15
must
be
preserved
by
a
callee
across
subroutine
calls
,
according
to
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
r19
~
r29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
//
at
return
,
clang
generates
"push {lr}, pop {pc}"" while gcc will generate "
bx
lr
"
//
according
to
https
:
//
stackoverflow
.
com
/
questions
/
53625807
//
even
if
we
jump
to
link
register
instead
of
saving
it
,
we
still
have
to
save
it
in
subroutine
calls
anyway
//
clang
's rule seems more simple, though there are no subroutine calls here
//
r4
-
r8
and
q4
-
q7
must
be
saved
according
to
https
:
//
static
.
docs
.
arm
.
com
/
ihi0042
/
i
/
aapcs32
.
pdf
push
{
r4
-
r8
,
r10
,
r11
,
lr
}
vpush
{
q4
-
q7
}
add
sp
,
sp
,
#
160
...
...
@@ -41,7 +44,8 @@ IndirectGemmFp32_8x4:
ldr
r7
,
[
sp
,
#
12
]
ldr
r8
,
[
sp
,
#
16
]
cbnz
r8
,
LoopOc
cmp
r8
,
#
0
bne
LoopOc
//
step
is
one
for
common
convolution
,
where
ic8
should
multiply
by
kernel
size
//
step
is
(
a
+
b
-
1
)
for
F
(
a
,
b
)
in
winograd
mul
r5
,
r4
,
r5
...
...
@@ -57,17 +61,18 @@ IndirectGemmFp32_8x4:
INIT_BIAS
//
load
input
for
output
1
-
2
vld1.32
{
q0
,
q1
,
q2
,
q3
},
[
x12
]!
vld1.32
{
q0
,
q1
},
[
r12
]!
vld1.32
{
q2
,
q3
},
[
r12
]!
//
load
weight
vld1.32
{
q4
,
q5
},
[
x
2
]!
vld1.32
{
q4
,
q5
},
[
r
2
]!
//
step
for
output
1
-
2
vmul.f32
q8
,
q4
,
d0
[
0
]
vmul.f32
q9
,
q4
,
d2
[
0
]
vmla.f32
q8
,
q5
,
d0
[
1
]
vmla.f32
q9
,
q5
,
d2
[
1
]
vld1.32
{
q6
,
q7
},
[
x
2
]!
vld1.32
{
q6
,
q7
},
[
r
2
]!
subs
x10
,
x
5
,
#
1
subs
r10
,
r
5
,
#
1
beq
LoopIcEnd
LoopIc
:
...
...
@@ -146,9 +151,11 @@ IndirectGemmFp32_8x4:
vmla.f32
q15
,
q7
,
d7
[
1
]
ldr
r10
,
[
sp
,
#
28
]
cbnz
r10
,
Relu6
cmp
r10
,
#
0
bne
Relu6
ldr
r10
,
[
sp
,
#
24
]
cbnz
x10
,
Relu
cmp
r10
,
#
0
bne
Relu
b
WriteStart
Relu6
:
vmov.i32
q14
,
#
6
...
...
@@ -174,7 +181,8 @@ IndirectGemmFp32_8x4:
WriteStart
:
ldr
r10
,
[
sp
,
#
20
]
cbnz
x10
,
WriteC4
cmp
r10
,
#
0
bne
WriteC4
cmp
r6
,
#
1
beq
Write1
cmp
r6
,
#
2
...
...
@@ -183,97 +191,97 @@ IndirectGemmFp32_8x4:
beq
Write3
b
Write4
Write1
:
str
s0
,
[
r11
]
add
r11
,
r11
,
x
7
str
s4
,
[
r11
]
add
r11
,
r11
,
x
7
str
s8
,
[
r11
]
add
r11
,
r11
,
x
7
str
s12
,
[
r11
]
add
r11
,
r11
,
x
7
str
s16
,
[
r11
]
add
r11
,
r11
,
x
7
str
s20
,
[
r11
]
add
r11
,
r11
,
x
7
str
s24
,
[
r11
]
add
r11
,
r11
,
x
7
str
s28
,
[
r11
]
vst1.32
d0
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d2
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d4
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d6
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d8
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d10
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d12
[
0
]
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d14
[
0
]
,
[
r11
]
add
r0
,
r0
,
#
4
b
WriteEnd
Write2
:
str
d0
,
[
r11
]
add
r11
,
r11
,
x
7
str
d2
,
[
r11
]
add
r11
,
r11
,
x
7
str
d4
,
[
r11
]
add
r11
,
r11
,
x
7
str
d6
,
[
r11
]
add
r11
,
r11
,
x
7
str
d8
,
[
r11
]
add
r11
,
r11
,
x
7
str
d10
,
[
r11
]
add
r11
,
r11
,
x
7
str
d12
,
[
r11
]
add
r11
,
r11
,
x
7
str
d14
,
[
r11
]
vst1.32
d0
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d2
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d4
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d6
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d8
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d10
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d12
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d14
,
[
r11
]
add
r0
,
r0
,
#
8
b
WriteEnd
Write3
:
add
r12
,
r11
,
#
8
str
d0
,
[
r11
]
add
r11
,
r11
,
x
7
str
s2
,
[
r12
]
vst1.32
d0
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d1
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d2
,
[
r11
]
add
r11
,
r11
,
x
7
str
s6
,
[
r12
]
vst1.32
d2
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d3
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d4
,
[
r11
]
add
r11
,
r11
,
x
7
str
s10
,
[
r12
]
vst1.32
d4
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d5
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d6
,
[
r11
]
add
r11
,
r11
,
x
7
str
s14
,
[
r12
]
vst1.32
d6
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d7
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d8
,
[
r11
]
add
r11
,
r11
,
x
7
str
s18
,
[
r12
]
vst1.32
d8
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d9
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d10
,
[
r11
]
add
r11
,
r11
,
x
7
str
s22
,
[
r12
]
vst1.32
d10
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d11
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d12
,
[
r11
]
add
r11
,
r11
,
x
7
str
s26
,
[
r12
]
vst1.32
d12
,
[
r11
]
add
r11
,
r11
,
r
7
vst1.32
d13
[
0
]
,
[
r12
]
add
r12
,
r12
,
r7
str
d14
,
[
r11
]
str
s30
,
[
r12
]
vst1.32
d14
,
[
r11
]
vst1.32
d15
[
0
]
,
[
r12
]
add
r0
,
r0
,
#
12
b
WriteEnd
WriteC4
:
vst1.32
q0
,
[
r11
],
x
7
vst1.32
q1
,
[
r11
],
x
7
vst1.32
q2
,
[
r11
],
x
7
vst1.32
q3
,
[
r11
],
x
7
vst1.32
q4
,
[
r11
],
x
7
vst1.32
q5
,
[
r11
],
x
7
vst1.32
q6
,
[
r11
],
x
7
vst1.32
q0
,
[
r11
],
r
7
vst1.32
q1
,
[
r11
],
r
7
vst1.32
q2
,
[
r11
],
r
7
vst1.32
q3
,
[
r11
],
r
7
vst1.32
q4
,
[
r11
],
r
7
vst1.32
q5
,
[
r11
],
r
7
vst1.32
q6
,
[
r11
],
r
7
vst1.32
q7
,
[
r11
]
add
r0
,
r0
,
#
16
b
WriteEnd
Write4
:
//
prefetching
is
not
prefered
while
writing
results
in
spite
of
cache
missings
//
you
could
try
prfm
pstl2
str
m
//
you
could
try
prfm
pstl2
vst1
.32
m
//
there
are
almost
no
benefits
observed
though
vst1.32
q0
,
[
r11
],
x
7
vst1.32
q1
,
[
r11
],
x
7
vst1.32
q2
,
[
r11
],
x
7
vst1.32
q3
,
[
r11
],
x
7
vst1.32
q4
,
[
r11
],
x
7
vst1.32
q5
,
[
r11
],
x
7
vst1.32
q6
,
[
r11
],
x
7
vst1.32
q0
,
[
r11
],
r
7
vst1.32
q1
,
[
r11
],
r
7
vst1.32
q2
,
[
r11
],
r
7
vst1.32
q3
,
[
r11
],
r
7
vst1.32
q4
,
[
r11
],
r
7
vst1.32
q5
,
[
r11
],
r
7
vst1.32
q6
,
[
r11
],
r
7
vst1.32
q7
,
[
r11
]
add
r0
,
r0
,
#
16
...
...
@@ -283,7 +291,8 @@ IndirectGemmFp32_8x4:
bne
LoopKsize
subs
r6
,
r6
,
#
4
cbz
r3
,
NoStepFowrard
cmp
r3
,
#
0
beq
NoStepFowrard
add
r3
,
r3
,
#
16
NoStepFowrard
:
bgt
LoopOc
...
...
@@ -292,3 +301,4 @@ IndirectGemmFp32_8x4:
vpop
{
q4
-
q7
}
pop
{
r4
-
r8
,
r10
,
r11
,
pc
}
#endif
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm32/IndirectGemmInt8_2x4.S
浏览文件 @
e52aa2b1
#ifdef __aarch64__
#ifdef __arm__
#ifndef __aarch64__
.
text
.
align
5
...
...
@@ -10,8 +11,8 @@
//
void
IndirectGemmInt8_2x4
(
int8_t
*
output
,
int8_t
*
input
,
int8_t
*
weight
,
int32_t
*
bias
,
size_t
ksize
,
size_t
ic4
,
//
size_t
oc
,
size_t
offset
,
int32_t
*
input_sum
,
size_t
act_min
,
size_t
act_max
,
size_t
out_zp
,
size_t
out_multiplier
,
//
size_t
shift_before
,
size_t
shift_after
)
;
//
x0
:
output
,
x1
:
input
,
r2
:
weight
,
x3
:
bias
,
x4
:
kSize
,
x5
:
ic4
,
x6
:
oc
,
x
7
:
offset
//
x8
:
input_sum
,
x10
:
act_min
,
x11
:
act_max
,
x10
:
out_zp
,
x11
:
out_multiplier
,
x10
:
shift_before
,
x
11
:
shift_after
//
r0
:
output
,
r1
:
input
,
r2
:
weight
,
r3
:
bias
,
r4
:
kSize
,
r5
:
ic4
,
r6
:
oc
,
r
7
:
offset
//
r8
:
input_sum
,
r10
:
act_min
,
r11
:
act_max
,
r10
:
out_zp
,
r11
:
out_multiplier
,
r10
:
shift_before
,
r
11
:
shift_after
IndirectGemmInt8_2x4
:
.
macro
INIT_BIAS
...
...
@@ -73,7 +74,7 @@ IndirectGemmInt8_2x4:
vmlal.s8
q6
,
d1
,
d9
vmlal.s8
q7
,
d1
,
d11
subs
x10
,
x
5
,
#
1
subs
r10
,
r
5
,
#
1
beq
LoopIcEnd
LoopIc
:
...
...
@@ -107,7 +108,7 @@ IndirectGemmInt8_2x4:
vmlal.s8
q6
,
d1
,
d9
vmlal.s8
q7
,
d1
,
d11
subs
x10
,
x
10
,
#
1
subs
r10
,
r
10
,
#
1
bne
LoopIc
LoopIcEnd
:
...
...
@@ -131,15 +132,23 @@ IndirectGemmInt8_2x4:
vld1.32
q0
[],
[
r10
]!
vld1.32
q1
[],
[
r10
]!
//
pairwise
add
vpadd.i32
q8
,
q8
,
q9
vpadd.i32
q10
,
q10
,
q11
vpadd.i32
q12
,
q12
,
q13
vpadd.i32
q14
,
q14
,
q15
vpadd.i32
q8
,
q8
,
q10
vpadd.i32
q12
,
q12
,
q14
vpadd.i32
d16
,
d16
,
d17
vpadd.i32
d18
,
d18
,
d19
vpadd.i32
d20
,
d20
,
d21
vpadd.i32
d22
,
d22
,
d23
vpadd.i32
d24
,
d24
,
d25
vpadd.i32
d26
,
d26
,
d27
vpadd.i32
d28
,
d28
,
d29
vpadd.i32
d30
,
d30
,
d31
vpadd.i32
d16
,
d16
,
d18
vpadd.i32
d17
,
d20
,
d22
vpadd.i32
d24
,
d24
,
d26
vpadd.i32
d25
,
d28
,
d30
vsub.i32
q8
,
q8
,
q0
vsub.i32
q12
,
q12
,
q1
cbz
r3
,
NoBias
cmp
r3
,
#
0
beq
NoBias
vld1.32
q2
,
[
r3
]
vadd.i32
q8
,
q8
,
q2
vadd.i32
q12
,
q12
,
q2
...
...
@@ -182,34 +191,34 @@ IndirectGemmInt8_2x4:
//
prefetching
is
not
prefered
while
writing
results
in
spite
of
cache
missings
//
you
could
try
prfm
pstl2strm
WriteStart
:
cmp
x
6
,
#
1
cmp
r
6
,
#
1
beq
Write1
cmp
x
6
,
#
2
cmp
r
6
,
#
2
beq
Write2
cmp
x
6
,
#
3
cmp
r
6
,
#
3
beq
Write3
b
Write4
Write1
:
vst1.8
{
d0
[
0
]},
[
x11
],
x
7
vst1.8
{
d0
[
1
]},
[
x
11
]
vst1.8
{
d0
[
0
]},
[
r11
],
r
7
vst1.8
{
d0
[
1
]},
[
r
11
]
add
r0
,
r0
,
#
1
b
WriteEnd
Write2
:
vst1.16
{
d0
[
0
]},
[
x11
],
x
7
vst1.16
{
d0
[
1
]},
[
x
11
]
vst1.16
{
d0
[
0
]},
[
r11
],
r
7
vst1.16
{
d0
[
1
]},
[
r
11
]
add
r0
,
r0
,
#
2
b
WriteEnd
Write3
:
add
x14
,
x
11
,
#
2
vst1.16
{
d0
[
0
]},
[
x11
],
x
7
vst1.16
{
d0
[
1
]},
[
x
11
]
vst1.8
{
d0
[
0
]},
[
x14
],
x
7
vst1.8
{
d0
[
1
]},
[
x
14
]
add
r14
,
r
11
,
#
2
vst1.16
{
d0
[
0
]},
[
r11
],
r
7
vst1.16
{
d0
[
1
]},
[
r
11
]
vst1.8
{
d0
[
0
]},
[
r14
],
r
7
vst1.8
{
d0
[
1
]},
[
r
14
]
add
r0
,
r0
,
#
3
b
WriteEnd
Write4
:
vst1.32
{
d0
[
0
]},
[
x11
],
x
7
vst1.32
{
d0
[
1
]},
[
x
11
]
vst1.32
{
d0
[
0
]},
[
r11
],
r
7
vst1.32
{
d0
[
1
]},
[
r
11
]
add
r0
,
r0
,
#
4
WriteEnd
:
...
...
@@ -218,7 +227,8 @@ IndirectGemmInt8_2x4:
bne
LoopKsize
subs
r6
,
r6
,
#
4
cbz
r3
,
NoStepFowrard
cmp
r3
,
#
0
beq
NoStepFowrard
add
r3
,
r3
,
#
16
NoStepFowrard
:
bgt
LoopOc
...
...
@@ -227,3 +237,4 @@ IndirectGemmInt8_2x4:
vpop
{
q4
-
q7
}
pop
{
r4
-
r8
,
r10
,
r11
,
pc
}
#endif
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAdd.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
//.
p2align
5
,,
15
.
global
C4BiasAdd
#ifndef __APPLE__
.
type
C4BiasAdd
,
%
function
#endif
//
void
C4BiasAdd
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
)
//
x0
:
dst
,
x1
:
input
,
x2
:
bias
,
x3
:
oc
,
x4
:
plane_size
,
x5
:
stride
C4BiasAdd
:
LoopOc
:
ld1
{
v4
.4
s
},
[
x2
],
#
16
mov
x6
,
x4
mov
x7
,
x0
cmp
x6
,
#
4
blt
Loop1
Loop4
:
ld1
{
v0
.4
s
,
v1
.4
s
,
v2
.4
s
,
v3
.4
s
},
[
x1
],
#
64
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fadd
v1
.4
s
,
v1
.4
s
,
v4
.4
s
fadd
v2
.4
s
,
v2
.4
s
,
v4
.4
s
fadd
v3
.4
s
,
v3
.4
s
,
v4
.4
s
cmp
x3
,
#
4
bge
Write4x4
cmp
x3
,
#
3
beq
Write3x4
cmp
x3
,
#
2
beq
Write2x4
Write1x4
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
str
s1
,
[
x7
]
add
x7
,
x7
,
x5
str
s2
,
[
x7
]
add
x7
,
x7
,
x5
str
s3
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write2x4
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write3x4
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v1
.
s
}[
2
],
[
x8
],
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v2
.
s
}[
2
],
[
x8
],
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v3
.
s
}[
2
],
[
x8
],
x5
b
WriteEndx4
Write4x4
:
st1
{
v0
.4
s
},
[
x7
],
x5
st1
{
v1
.4
s
},
[
x7
],
x5
st1
{
v2
.4
s
},
[
x7
],
x5
st1
{
v3
.4
s
},
[
x7
],
x5
WriteEndx4
:
subs
x6
,
x6
,
#
4
beq
LoopOcEnd
cmp
x6
,
#
4
blt
Loop1
b
Loop4
Loop1
:
ld1
{
v0
.4
s
},
[
x1
],
#
16
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
cmp
x3
,
#
4
bge
Write4
cmp
x3
,
#
3
beq
Write3
cmp
x3
,
#
2
beq
Write2
Write1
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write2
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write3
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
b
WriteEnd
Write4
:
st1
{
v0
.4
s
},
[
x7
],
x5
WriteEnd
:
subs
x6
,
x6
,
#
1
bne
Loop1
LoopOcEnd
:
subs
x3
,
x3
,
#
4
add
x0
,
x0
,
#
16
bgt
LoopOc
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
//.
p2align
5
,,
15
.
global
C4BiasAddRelu
#ifndef __APPLE__
.
type
C4BiasAddRelu
,
%
function
#endif
//
void
C4BiasAddRelu
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
)
//
x0
:
dst
,
x1
:
input
,
x2
:
bias
,
x3
:
oc
,
x4
:
plane_size
,
x5
:
stride
C4BiasAddRelu
:
dup
v5
.4
s
,
wzr
LoopOc
:
ld1
{
v4
.4
s
},
[
x2
],
#
16
mov
x6
,
x4
mov
x7
,
x0
cmp
x6
,
#
4
blt
Loop1
Loop4
:
ld1
{
v0
.4
s
,
v1
.4
s
,
v2
.4
s
,
v3
.4
s
},
[
x1
],
#
64
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fadd
v1
.4
s
,
v1
.4
s
,
v4
.4
s
fadd
v2
.4
s
,
v2
.4
s
,
v4
.4
s
fadd
v3
.4
s
,
v3
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v5
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v5
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v5
.4
s
cmp
x3
,
#
4
bge
Write4x4
cmp
x3
,
#
3
beq
Write3x4
cmp
x3
,
#
2
beq
Write2x4
Write1x4
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
str
s1
,
[
x7
]
add
x7
,
x7
,
x5
str
s2
,
[
x7
]
add
x7
,
x7
,
x5
str
s3
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write2x4
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write3x4
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v1
.
s
}[
2
],
[
x8
],
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v2
.
s
}[
2
],
[
x8
],
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v3
.
s
}[
2
],
[
x8
],
x5
b
WriteEndx4
Write4x4
:
st1
{
v0
.4
s
},
[
x7
],
x5
st1
{
v1
.4
s
},
[
x7
],
x5
st1
{
v2
.4
s
},
[
x7
],
x5
st1
{
v3
.4
s
},
[
x7
],
x5
WriteEndx4
:
subs
x6
,
x6
,
#
4
beq
LoopOcEnd
cmp
x6
,
#
4
blt
Loop1
b
Loop4
Loop1
:
ld1
{
v0
.4
s
},
[
x1
],
#
16
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
cmp
x3
,
#
4
bge
Write4
cmp
x3
,
#
3
beq
Write3
cmp
x3
,
#
2
beq
Write2
Write1
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write2
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write3
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
b
WriteEnd
Write4
:
st1
{
v0
.4
s
},
[
x7
],
x5
WriteEnd
:
subs
x6
,
x6
,
#
1
bne
Loop1
LoopOcEnd
:
subs
x3
,
x3
,
#
4
add
x0
,
x0
,
#
16
bgt
LoopOc
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4BiasAddRelu6.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
//.
p2align
5
,,
15
.
global
C4BiasAddRelu6
#ifndef __APPLE__
.
type
C4BiasAddRelu6
,
%
function
#endif
//
void
C4BiC4BiasAddRelu6asAdd
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
)
//
x0
:
dst
,
x1
:
input
,
x2
:
bias
,
x3
:
oc
,
x4
:
plane_size
,
x5
:
stride
C4BiasAddRelu6
:
dup
v5
.4
s
,
wzr
movi
v6
.4
s
,
#
6
scvtf
v6
.4
s
,
v6
.4
s
LoopOc
:
ld1
{
v4
.4
s
},
[
x2
],
#
16
mov
x6
,
x4
mov
x7
,
x0
cmp
x6
,
#
4
blt
Loop1
Loop4
:
ld1
{
v0
.4
s
,
v1
.4
s
,
v2
.4
s
,
v3
.4
s
},
[
x1
],
#
64
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fadd
v1
.4
s
,
v1
.4
s
,
v4
.4
s
fadd
v2
.4
s
,
v2
.4
s
,
v4
.4
s
fadd
v3
.4
s
,
v3
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v5
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v5
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v5
.4
s
fmin
v0
.4
s
,
v0
.4
s
,
v6
.4
s
fmin
v1
.4
s
,
v1
.4
s
,
v6
.4
s
fmin
v2
.4
s
,
v2
.4
s
,
v6
.4
s
fmin
v3
.4
s
,
v3
.4
s
,
v6
.4
s
cmp
x3
,
#
4
bge
Write4x4
cmp
x3
,
#
3
beq
Write3x4
cmp
x3
,
#
2
beq
Write2x4
Write1x4
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
str
s1
,
[
x7
]
add
x7
,
x7
,
x5
str
s2
,
[
x7
]
add
x7
,
x7
,
x5
str
s3
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write2x4
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEndx4
Write3x4
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v1
.
s
}[
2
],
[
x8
],
x5
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v2
.
s
}[
2
],
[
x8
],
x5
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v3
.
s
}[
2
],
[
x8
],
x5
b
WriteEndx4
Write4x4
:
st1
{
v0
.4
s
},
[
x7
],
x5
st1
{
v1
.4
s
},
[
x7
],
x5
st1
{
v2
.4
s
},
[
x7
],
x5
st1
{
v3
.4
s
},
[
x7
],
x5
WriteEndx4
:
subs
x6
,
x6
,
#
4
beq
LoopOcEnd
cmp
x6
,
#
4
blt
Loop1
b
Loop4
Loop1
:
ld1
{
v0
.4
s
},
[
x1
],
#
16
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmin
v0
.4
s
,
v0
.4
s
,
v6
.4
s
cmp
x3
,
#
4
bge
Write4
cmp
x3
,
#
3
beq
Write3
cmp
x3
,
#
2
beq
Write2
Write1
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write2
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
b
WriteEnd
Write3
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x5
st1
{
v0
.
s
}[
2
],
[
x8
],
x5
b
WriteEnd
Write4
:
st1
{
v0
.4
s
},
[
x7
],
x5
WriteEnd
:
subs
x6
,
x6
,
#
1
bne
Loop1
LoopOcEnd
:
subs
x3
,
x3
,
#
4
add
x0
,
x0
,
#
16
bgt
LoopOc
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
//.
p2align
5
,,
15
.
global
C4Relu
#ifndef __APPLE__
.
type
C4Relu
,
%
function
#endif
//
void
C4Relu
(
float
*
dst
,
const
float
*
input
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
)
//
x0
:
dst
,
x1
:
input
,
x2
:
oc
,
x3
:
plane_size
,
x4
:
stride
C4Relu
:
dup
v5
.4
s
,
wzr
LoopOc
:
mov
x6
,
x3
mov
x7
,
x0
cmp
x6
,
#
4
blt
Loop1
Loop4
:
ld1
{
v0
.4
s
,
v1
.4
s
,
v2
.4
s
,
v3
.4
s
},
[
x1
],
#
64
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v5
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v5
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v5
.4
s
cmp
x2
,
#
4
bge
Write4x4
cmp
x2
,
#
3
beq
Write3x4
cmp
x2
,
#
2
beq
Write2x4
Write1x4
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x4
str
s1
,
[
x7
]
add
x7
,
x7
,
x4
str
s2
,
[
x7
]
add
x7
,
x7
,
x4
str
s3
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEndx4
Write2x4
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x4
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x4
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEndx4
Write3x4
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v0
.
s
}[
2
],
[
x8
],
x4
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v1
.
s
}[
2
],
[
x8
],
x4
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v2
.
s
}[
2
],
[
x8
],
x4
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v3
.
s
}[
2
],
[
x8
],
x4
b
WriteEndx4
Write4x4
:
st1
{
v0
.4
s
},
[
x7
],
x4
st1
{
v1
.4
s
},
[
x7
],
x4
st1
{
v2
.4
s
},
[
x7
],
x4
st1
{
v3
.4
s
},
[
x7
],
x4
WriteEndx4
:
subs
x6
,
x6
,
#
4
beq
LoopOcEnd
cmp
x6
,
#
4
blt
Loop1
b
Loop4
Loop1
:
ld1
{
v0
.4
s
},
[
x1
],
#
16
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
cmp
x2
,
#
4
bge
Write4
cmp
x2
,
#
3
beq
Write3
cmp
x2
,
#
2
beq
Write2
Write1
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEnd
Write2
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEnd
Write3
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v0
.
s
}[
2
],
[
x8
],
x4
b
WriteEnd
Write4
:
st1
{
v0
.4
s
},
[
x7
],
x4
WriteEnd
:
subs
x6
,
x6
,
#
1
bne
Loop1
LoopOcEnd
:
subs
x2
,
x2
,
#
4
add
x0
,
x0
,
#
16
bgt
LoopOc
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/C4Relu6.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
//.
p2align
5
,,
15
.
global
C4Relu6
#ifndef __APPLE__
.
type
C4Relu6
,
%
function
#endif
//
void
C4Relu6
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
)
//
x0
:
dst
,
x1
:
input
,
x2
:
oc
,
x2
:
plane_size
,
x3
:
stride
C4Relu6
:
dup
v5
.4
s
,
wzr
movi
v6
.4
s
,
#
6
scvtf
v6
.4
s
,
v6
.4
s
LoopOc
:
mov
x6
,
x3
mov
x7
,
x0
cmp
x6
,
#
4
blt
Loop1
Loop4
:
ld1
{
v0
.4
s
,
v1
.4
s
,
v2
.4
s
,
v3
.4
s
},
[
x1
],
#
64
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v5
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v5
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v5
.4
s
fmin
v0
.4
s
,
v0
.4
s
,
v6
.4
s
fmin
v1
.4
s
,
v1
.4
s
,
v6
.4
s
fmin
v2
.4
s
,
v2
.4
s
,
v6
.4
s
fmin
v3
.4
s
,
v3
.4
s
,
v6
.4
s
cmp
x2
,
#
4
bge
Write4x4
cmp
x2
,
#
3
beq
Write3x4
cmp
x2
,
#
2
beq
Write2x4
Write1x4
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x4
str
s1
,
[
x7
]
add
x7
,
x7
,
x4
str
s2
,
[
x7
]
add
x7
,
x7
,
x4
str
s3
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEndx4
Write2x4
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x4
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x4
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEndx4
Write3x4
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v0
.
s
}[
2
],
[
x8
],
x4
dup
s17
,
v1
.
s
[
1
]
stp
s1
,
s17
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v1
.
s
}[
2
],
[
x8
],
x4
dup
s18
,
v2
.
s
[
1
]
stp
s2
,
s18
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v2
.
s
}[
2
],
[
x8
],
x4
dup
s19
,
v3
.
s
[
1
]
stp
s3
,
s19
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v3
.
s
}[
2
],
[
x8
],
x4
b
WriteEndx4
Write4x4
:
st1
{
v0
.4
s
},
[
x7
],
x4
st1
{
v1
.4
s
},
[
x7
],
x4
st1
{
v2
.4
s
},
[
x7
],
x4
st1
{
v3
.4
s
},
[
x7
],
x4
WriteEndx4
:
subs
x6
,
x6
,
#
4
beq
LoopOcEnd
cmp
x6
,
#
4
blt
Loop1
b
Loop4
Loop1
:
ld1
{
v0
.4
s
},
[
x1
],
#
16
fadd
v0
.4
s
,
v0
.4
s
,
v4
.4
s
fmax
v0
.4
s
,
v0
.4
s
,
v5
.4
s
fmin
v0
.4
s
,
v0
.4
s
,
v6
.4
s
cmp
x2
,
#
4
bge
Write4
cmp
x2
,
#
3
beq
Write3
cmp
x2
,
#
2
beq
Write2
Write1
:
str
s0
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEnd
Write2
:
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
b
WriteEnd
Write3
:
add
x8
,
x7
,
#
8
dup
s16
,
v0
.
s
[
1
]
stp
s0
,
s16
,
[
x7
]
add
x7
,
x7
,
x4
st1
{
v0
.
s
}[
2
],
[
x8
],
x4
b
WriteEnd
Write4
:
st1
{
v0
.4
s
},
[
x7
],
x4
WriteEnd
:
subs
x6
,
x6
,
#
1
bne
Loop1
LoopOcEnd
:
subs
x2
,
x2
,
#
4
add
x0
,
x0
,
#
16
bgt
LoopOc
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/ConvDwFp32Center.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
.
global
ConvDwFp32Center
#ifndef __APPLE__
.
type
ConvDwFp32Center
,
%
function
#endif
//
void
ConvDwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
size_t
height
,
size_t
width
,
//
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
//
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
relu
,
size_t
relu6
)
;
//
x0
:
dst
,
x1
:
src
,
x2
:
weight
,
x3
:
bias
,
x4
:
height
,
x5
:
weight
,
x6
:
kernel_h
,
x7
:
kernel_w
,
//
x8
:
out_h_step
,
x9
:
block_channel
,
x10
:
in_sh_step
,
x11
:
in_sw_step
,
x12
:
in_kh_step
,
x13
:
in_kw_step
//
x14
:
relu
,
x15
:
relu6
ConvDwFp32Center
:
//
registers
v8
~
v15
must
be
preserved
by
a
callee
across
subroutine
calls
,
according
to
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
x19
~
x29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
sub
sp
,
sp
,
#
48
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
stp
x23
,
x24
,
[
sp
],
#
16
ldr
x8
,
[
sp
]
ldr
x9
,
[
sp
,
#
8
]
ldr
x10
,
[
sp
,
#
16
]
ldr
x11
,
[
sp
,
#
24
]
ldr
x12
,
[
sp
,
#
32
]
ldr
x13
,
[
sp
,
#
40
]
ldr
x14
,
[
sp
,
#
48
]
ldr
x15
,
[
sp
,
#
56
]
mov
x16
,
#
4
mul
x8
,
x8
,
x16
mul
x9
,
x9
,
x16
mul
x10
,
x10
,
x16
mul
x11
,
x11
,
x16
mul
x12
,
x12
,
x16
mul
x13
,
x13
,
x16
mov
x16
,
#
16
mul
x19
,
x7
,
x16
ld1
{
v5
.4
s
},
[
x3
]
LoopH
:
mov
x23
,
x1
mov
x24
,
x5
mov
x3
,
x0
LoopW
:
mov
x16
,
x23
mov
x17
,
x2
mov
x20
,
x6
ld1
{
v0
.4
s
},
[
x3
]
fadd
v0
.4
s
,
v0
.4
s
,
v5
.4
s
LoopKh
:
mov
x18
,
x7
mov
x21
,
x17
mov
x22
,
x16
LoopKw
:
ld1
{
v1
.4
s
},
[
x22
],
x13
ld1
{
v2
.4
s
},
[
x21
],
#
16
fmla
v0
.4
s
,
v1
.4
s
,
v2
.4
s
subs
x18
,
x18
,
#
1
bne
LoopKw
add
x16
,
x16
,
x12
add
x17
,
x17
,
x19
subs
x20
,
x20
,
#
1
bne
LoopKh
cbnz
x15
,
Relu6
cbnz
x14
,
Relu
b
Write
Relu6
:
movi
v4
.4
s
,
#
6
scvtf
v4
.4
s
,
v4
.4
s
fmin
v0
.4
s
,
v0
.4
s
,
v4
.4
s
Relu
:
dup
v3
.4
s
,
wzr
fmax
v0
.4
s
,
v0
.4
s
,
v3
.4
s
Write
:
st1
{
v0
.4
s
},
[
x3
],
x9
add
x23
,
x23
,
x11
subs
x24
,
x24
,
#
1
bne
LoopW
add
x0
,
x0
,
x8
add
x1
,
x1
,
x10
subs
x4
,
x4
,
#
1
bne
LoopH
sub
sp
,
sp
,
#
48
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ldp
x23
,
x24
,
[
sp
],
#
16
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/assembly/arm64/DeconvDwFp32Center.S
0 → 100644
浏览文件 @
e52aa2b1
#ifdef __aarch64__
.
text
.
align
5
.
global
DeconvDwFp32Center
#ifndef __APPLE__
.
type
DeconvDwFp32Center
,
%
function
#endif
//
void
DeconvDwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
size_t
height
,
size_t
width
,
//
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
//
size_t
in_kh_step
,
size_t
in_kw_step
)
;
//
x0
:
dst
,
x1
:
src
,
x2
:
weight
,
x3
:
height
,
x4
:
weight
,
x5
:
kernel_h
,
x6
:
kernel_w
,
x7
:
out_h_step
//
x8
:
block_channel
,
x9
:
in_sh_step
,
x10
:
in_sw_step
,
x11
:
in_kh_step
,
x12
:
in_kw_step
DeconvDwFp32Center
:
//
registers
v8
~
v15
must
be
preserved
by
a
callee
across
subroutine
calls
,
according
to
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
x19
~
x29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
sub
sp
,
sp
,
#
32
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
ldr
x8
,
[
sp
]
ldr
x9
,
[
sp
,
#
8
]
ldr
x10
,
[
sp
,
#
16
]
ldr
x11
,
[
sp
,
#
24
]
ldr
x12
,
[
sp
,
#
32
]
mov
x13
,
#
4
mul
x7
,
x7
,
x13
mul
x8
,
x8
,
x13
mul
x9
,
x9
,
x13
mul
x10
,
x10
,
x13
mul
x11
,
x11
,
x13
mul
x12
,
x12
,
x13
mov
x13
,
#
16
mul
x14
,
x6
,
x13
LoopH
:
mov
x15
,
x0
mov
x16
,
x1
mov
x17
,
x4
LoopW
:
mov
x18
,
x15
mov
x19
,
x2
mov
x20
,
x5
LoopKh
:
mov
x21
,
x18
mov
x22
,
x19
mov
x13
,
x6
LoopKw
:
ld1
{
v0
.4
s
},
[
x21
]
ld1
{
v1
.4
s
},
[
x16
]
ld1
{
v2
.4
s
},
[
x22
],
#
16
fmla
v0
.4
s
,
v1
.4
s
,
v2
.4
s
st1
{
v0
.4
s
},
[
x21
],
x12
subs
x13
,
x13
,
#
1
bne
LoopKw
add
x18
,
x18
,
x11
add
x19
,
x19
,
x14
subs
x20
,
x20
,
#
1
bne
LoopKh
add
x15
,
x15
,
x10
add
x16
,
x16
,
x8
subs
x17
,
x17
,
#
1
bne
LoopW
add
x0
,
x0
,
x9
add
x1
,
x1
,
x7
subs
x3
,
x3
,
#
1
bne
LoopH
sub
sp
,
sp
,
#
32
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ret
#endif
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.cc
浏览文件 @
e52aa2b1
...
...
@@ -81,20 +81,19 @@ void PostConvFuncFp32(const float *c4_out_ptr, float *out_ptr, const float *bias
}
}
#else
int
oc4
=
UP_DIV
(
output_channel
,
C4NUM
);
if
(
bias_ptr
!=
nullptr
)
{
if
(
is_relu
)
{
BiasAddRelu
(
bias_ptr
,
out_ptr
,
oc4
,
plane_size
);
C4BiasAddRelu
(
out_ptr
,
c4_out_ptr
,
bias_ptr
,
output_channel
,
plane_size
,
stride
*
sizeof
(
float
)
);
}
else
if
(
is_relu6
)
{
BiasAddRelu6
(
bias_ptr
,
out_ptr
,
oc4
,
plane_size
);
C4BiasAddRelu6
(
out_ptr
,
c4_out_ptr
,
bias_ptr
,
output_channel
,
plane_size
,
stride
*
sizeof
(
float
)
);
}
else
{
BiasAdd
(
bias_ptr
,
out_ptr
,
oc4
,
plane_size
);
C4BiasAdd
(
out_ptr
,
c4_out_ptr
,
bias_ptr
,
output_channel
,
plane_size
,
stride
*
sizeof
(
float
)
);
}
}
else
{
if
(
is_relu
)
{
Relu
(
out_ptr
,
oc4
*
plane_size
);
C4Relu
(
out_ptr
,
c4_out_ptr
,
output_channel
,
plane_size
,
stride
*
sizeof
(
float
)
);
}
else
if
(
is_relu6
)
{
Relu6
(
out_ptr
,
oc4
*
plane_size
);
C4Relu6
(
out_ptr
,
c4_out_ptr
,
output_channel
,
plane_size
,
stride
*
sizeof
(
float
)
);
}
else
{
// do nothing
}
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/common_func.h
浏览文件 @
e52aa2b1
...
...
@@ -42,6 +42,17 @@ void BiasAddRelu6(const float *bias, float *data, size_t oc4, size_t plan_size);
void
BiasAddRelu
(
const
float
*
bias
,
float
*
data
,
size_t
oc4
,
size_t
plan_size
);
void
Relu6
(
float
*
data
,
size_t
element4
);
void
Relu
(
float
*
data
,
size_t
element4
);
void
C4BiasAdd
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
);
void
C4BiasAddRelu
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
);
void
C4BiasAddRelu6
(
float
*
dst
,
const
float
*
input
,
const
float
*
bias
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
);
void
C4Relu
(
float
*
dst
,
const
float
*
input
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
);
void
C4Relu6
(
float
*
dst
,
const
float
*
input
,
size_t
oc
,
size_t
plane_size
,
size_t
stride
);
void
ConvDwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
size_t
height
,
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
relu
,
size_t
relu6
);
void
DeconvDwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
size_t
height
,
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
);
#endif
#ifdef __cplusplus
...
...
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
浏览文件 @
e52aa2b1
...
...
@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/opclib/fp32/conv_depthwise.h"
#include "src/runtime/kernel/arm/opclib/fp32/common_func.h"
#ifdef ENABLE_ARM64
#include <arm_neon.h>
#endif
...
...
@@ -122,6 +123,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
void
DepthwiseCenter
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
int
height
,
int
width
,
int
kernel_h
,
int
kernel_w
,
int
out_h_step
,
int
block_channel
,
int
in_sh_step
,
int
in_sw_step
,
int
in_kh_step
,
int
in_kw_step
,
bool
is_relu
,
bool
is_relu6
)
{
#ifdef ENABLE_ARM64
ConvDwFp32Center
(
dst
,
src
,
weight
,
bias
,
height
,
width
,
kernel_h
,
kernel_w
,
out_h_step
,
block_channel
,
in_sh_step
,
in_sw_step
,
in_kh_step
,
in_kw_step
,
is_relu
,
is_relu6
);
#else
float
*
dst_h
=
dst
;
const
float
*
src_h
=
src
;
for
(
int
oh
=
0
;
oh
<
height
;
oh
++
)
{
...
...
@@ -163,6 +168,7 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
dst_h
+=
out_h_step
;
src_h
+=
in_sh_step
;
}
// dst_height loop
#endif
}
// conv depthwise fp32: sliding window
...
...
@@ -262,6 +268,10 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
void
DeconvDepthwiseCenter
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
int
height
,
int
width
,
int
kernel_h
,
int
kernel_w
,
int
out_h_step
,
int
block_channel
,
int
in_sh_step
,
int
in_sw_step
,
int
in_kh_step
,
int
in_kw_step
)
{
#ifdef ENABLE_ARM64
DeconvDwFp32Center
(
dst
,
src
,
weight
,
height
,
width
,
kernel_h
,
kernel_w
,
out_h_step
,
block_channel
,
in_sh_step
,
in_sw_step
,
in_kh_step
,
in_kw_step
);
#else
float
*
dst_h
=
dst
;
const
float
*
src_h
=
src
;
for
(
int
oh
=
0
;
oh
<
height
;
oh
++
)
{
...
...
@@ -297,6 +307,7 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
dst_h
+=
in_sh_step
;
src_h
+=
out_h_step
;
}
// dst_height loop
#endif
}
void
DeconvDepthwisePostFunc
(
float
*
dst
,
const
float
*
bias
,
int
block_channel
,
const
ConvParameter
*
conv_param
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录