Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
0ec5a570
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0ec5a570
编写于
8月 19, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 19, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4739 [MS][LITE][Develop]add fp32 sliding window kernel
Merge pull request !4739 from lixian/master
上级
9ce6b36e
a5bd2548
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
481 addition
and
8 deletion
+481
-8
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+1
-2
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
...untime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
+6
-2
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
...untime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
+6
-2
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
.../runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
+446
-0
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
...pore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
+5
-0
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
+17
-2
未找到文件。
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
浏览文件 @
0ec5a570
...
...
@@ -258,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionWinogradCPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
,
out_unit
);
}
else
if
(
use_sw
)
{
// kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionCPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
);
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionSWCPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
else
{
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionCPUKernel
(
op_parameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwFp32Center.S
浏览文件 @
0ec5a570
...
...
@@ -18,7 +18,9 @@ ConvDwFp32Center:
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
x19
~
x29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
sub
sp
,
sp
,
#
48
sub
sp
,
sp
,
#
176
st1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
st1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
stp
x23
,
x24
,
[
sp
],
#
16
...
...
@@ -287,7 +289,9 @@ ConvDwFp32Center:
subs
x4
,
x4
,
#
1
bne
LoopH
sub
sp
,
sp
,
#
48
sub
sp
,
sp
,
#
176
ld1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
ld1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ldp
x23
,
x24
,
[
sp
],
#
16
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvDwInt8Center.S
浏览文件 @
0ec5a570
...
...
@@ -19,7 +19,9 @@ ConvDwInt8Center:
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
x19
~
x29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
sub
sp
,
sp
,
#
48
sub
sp
,
sp
,
#
176
st1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
st1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
stp
x23
,
x24
,
[
sp
],
#
16
...
...
@@ -631,7 +633,9 @@ ConvDwInt8Center:
subs
x4
,
x4
,
#
1
bne
LoopH
sub
sp
,
sp
,
#
48
sub
sp
,
sp
,
#
176
ld1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
ld1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ldp
x23
,
x24
,
[
sp
],
#
16
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/arm64/ConvFp32Center.S
0 → 100644
浏览文件 @
0ec5a570
#ifdef __aarch64__
.
text
.
align
5
.
global
ConvSwFp32Center
#ifndef __APPLE__
.
type
ConvSwFp32Center
,
%
function
#endif
//
void
ConvSwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
size_t
height
,
size_t
width
,
//
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
ic4
,
size_t
in_sh_step
,
//
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
relu
,
size_t
relu6
)
;
//
x0
:
dst
,
x1
:
src
,
x2
:
weight
,
x3
:
bias
,
x4
:
height
,
x5
:
width
,
x6
:
kernel_h
,
x7
:
kernel_w
,
//
x8
:
out_h_step
,
x9
:
block_channel
,
x10
:
ic4
,
x11
:
in_sh_step
,
x12
:
in_sw_step
,
x13
:
in_kh_step
,
x14
:
in_kw_step
//
x26
:
relu
,
x16
:
relu6
ConvSwFp32Center
:
//
registers
v8
~
v15
must
be
preserved
by
a
callee
across
subroutine
calls
,
according
to
//
https
:
//
github
.
com
/
ARM
-
software
/
abi
-
aa
/
blob
/
master
/
aapcs64
/
aapcs64
.
rst
#
simd
-
and
-
floating
-
point
-
registers
//
x19
~
x29
should
be
also
preserved
//
whereas
our
coding
style
do
not
permit
such
amount
of
parameters
sub
sp
,
sp
,
#
208
st1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
st1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
stp
x19
,
x20
,
[
sp
],
#
16
stp
x21
,
x22
,
[
sp
],
#
16
stp
x23
,
x24
,
[
sp
],
#
16
stp
x25
,
x26
,
[
sp
],
#
16
stp
x27
,
x28
,
[
sp
],
#
16
ldr
x8
,
[
sp
]
ldr
x9
,
[
sp
,
#
8
]
ldr
x10
,
[
sp
,
#
16
]
ldr
x11
,
[
sp
,
#
24
]
ldr
x12
,
[
sp
,
#
32
]
ldr
x13
,
[
sp
,
#
40
]
ldr
x14
,
[
sp
,
#
48
]
mul
x15
,
x6
,
x7
mul
x15
,
x10
,
x15
mov
x16
,
#
16
mul
x15
,
x15
,
x16
ld1
{
v25
.4
s
},
[
x3
]
movi
v26
.4
s
,
#
6
scvtf
v26
.4
s
,
v26
.4
s
dup
v27
.4
s
,
wzr
LoopH
:
mov
x17
,
x1
mov
x18
,
x5
mov
x3
,
x0
cmp
x18
,
#
8
blt
LoopW
cmp
x18
,
#
16
blt
LoopW8
LoopW16
:
mov
x19
,
#
16
mul
x19
,
x19
,
x12
mov
x20
,
x17
mov
x21
,
x2
mov
x22
,
x6
mov
v0
.16
b
,
v25
.16
b
mov
v1
.16
b
,
v25
.16
b
mov
v2
.16
b
,
v25
.16
b
mov
v3
.16
b
,
v25
.16
b
mov
v4
.16
b
,
v25
.16
b
mov
v5
.16
b
,
v25
.16
b
mov
v6
.16
b
,
v25
.16
b
mov
v7
.16
b
,
v25
.16
b
mov
v8
.16
b
,
v25
.16
b
mov
v9
.16
b
,
v25
.16
b
mov
v10
.16
b
,
v25
.16
b
mov
v11
.16
b
,
v25
.16
b
mov
v12
.16
b
,
v25
.16
b
mov
v13
.16
b
,
v25
.16
b
mov
v14
.16
b
,
v25
.16
b
mov
v15
.16
b
,
v25
.16
b
LoopKh16
:
mov
x23
,
x7
mov
x24
,
x20
LoopKw16
:
mov
x25
,
x24
mov
x27
,
x10
LoopIc16
:
mov
x26
,
x25
mov
x16
,
x21
ld1
{
v28
.4
s
},
[
x16
],
x15
ld1
{
v29
.4
s
},
[
x16
],
x15
ld1
{
v30
.4
s
},
[
x16
],
x15
ld1
{
v31
.4
s
},
[
x16
],
x15
zip1
v20
.4
s
,
v28
.4
s
,
v29
.4
s
zip2
v21
.4
s
,
v28
.4
s
,
v29
.4
s
zip1
v22
.4
s
,
v30
.4
s
,
v31
.4
s
zip2
v23
.4
s
,
v30
.4
s
,
v31
.4
s
ld1
{
v16
.4
s
},
[
x26
],
x12
ld1
{
v17
.4
s
},
[
x26
],
x12
trn1
v28
.2
d
,
v20
.2
d
,
v22
.2
d
trn2
v29
.2
d
,
v20
.2
d
,
v22
.2
d
trn1
v30
.2
d
,
v21
.2
d
,
v23
.2
d
trn2
v31
.2
d
,
v21
.2
d
,
v23
.2
d
ld1
{
v18
.4
s
},
[
x26
],
x12
ld1
{
v19
.4
s
},
[
x26
],
x12
fmla
v0
.4
s
,
v28
.4
s
,
v16
.
s
[
0
]
fmla
v1
.4
s
,
v28
.4
s
,
v17
.
s
[
0
]
fmla
v0
.4
s
,
v29
.4
s
,
v16
.
s
[
1
]
fmla
v1
.4
s
,
v29
.4
s
,
v17
.
s
[
1
]
fmla
v0
.4
s
,
v30
.4
s
,
v16
.
s
[
2
]
fmla
v1
.4
s
,
v30
.4
s
,
v17
.
s
[
2
]
fmla
v0
.4
s
,
v31
.4
s
,
v16
.
s
[
3
]
fmla
v1
.4
s
,
v31
.4
s
,
v17
.
s
[
3
]
ld1
{
v20
.4
s
},
[
x26
],
x12
ld1
{
v21
.4
s
},
[
x26
],
x12
fmla
v2
.4
s
,
v28
.4
s
,
v18
.
s
[
0
]
fmla
v3
.4
s
,
v28
.4
s
,
v19
.
s
[
0
]
fmla
v2
.4
s
,
v29
.4
s
,
v18
.
s
[
1
]
fmla
v3
.4
s
,
v29
.4
s
,
v19
.
s
[
1
]
fmla
v2
.4
s
,
v30
.4
s
,
v18
.
s
[
2
]
fmla
v3
.4
s
,
v30
.4
s
,
v19
.
s
[
2
]
fmla
v2
.4
s
,
v31
.4
s
,
v18
.
s
[
3
]
fmla
v3
.4
s
,
v31
.4
s
,
v19
.
s
[
3
]
ld1
{
v22
.4
s
},
[
x26
],
x12
ld1
{
v23
.4
s
},
[
x26
],
x12
fmla
v4
.4
s
,
v28
.4
s
,
v20
.
s
[
0
]
fmla
v5
.4
s
,
v28
.4
s
,
v21
.
s
[
0
]
fmla
v4
.4
s
,
v29
.4
s
,
v20
.
s
[
1
]
fmla
v5
.4
s
,
v29
.4
s
,
v21
.
s
[
1
]
fmla
v4
.4
s
,
v30
.4
s
,
v20
.
s
[
2
]
fmla
v5
.4
s
,
v30
.4
s
,
v21
.
s
[
2
]
fmla
v4
.4
s
,
v31
.4
s
,
v20
.
s
[
3
]
fmla
v5
.4
s
,
v31
.4
s
,
v21
.
s
[
3
]
ld1
{
v16
.4
s
},
[
x26
],
x12
ld1
{
v17
.4
s
},
[
x26
],
x12
fmla
v6
.4
s
,
v28
.4
s
,
v22
.
s
[
0
]
fmla
v7
.4
s
,
v28
.4
s
,
v23
.
s
[
0
]
fmla
v6
.4
s
,
v29
.4
s
,
v22
.
s
[
1
]
fmla
v7
.4
s
,
v29
.4
s
,
v23
.
s
[
1
]
fmla
v6
.4
s
,
v30
.4
s
,
v22
.
s
[
2
]
fmla
v7
.4
s
,
v30
.4
s
,
v23
.
s
[
2
]
fmla
v6
.4
s
,
v31
.4
s
,
v22
.
s
[
3
]
fmla
v7
.4
s
,
v31
.4
s
,
v23
.
s
[
3
]
ld1
{
v18
.4
s
},
[
x26
],
x12
ld1
{
v19
.4
s
},
[
x26
],
x12
fmla
v8
.4
s
,
v28
.4
s
,
v16
.
s
[
0
]
fmla
v9
.4
s
,
v28
.4
s
,
v17
.
s
[
0
]
fmla
v8
.4
s
,
v29
.4
s
,
v16
.
s
[
1
]
fmla
v9
.4
s
,
v29
.4
s
,
v17
.
s
[
1
]
fmla
v8
.4
s
,
v30
.4
s
,
v16
.
s
[
2
]
fmla
v9
.4
s
,
v30
.4
s
,
v17
.
s
[
2
]
fmla
v8
.4
s
,
v31
.4
s
,
v16
.
s
[
3
]
fmla
v9
.4
s
,
v31
.4
s
,
v17
.
s
[
3
]
ld1
{
v20
.4
s
},
[
x26
],
x12
ld1
{
v21
.4
s
},
[
x26
],
x12
fmla
v10
.4
s
,
v28
.4
s
,
v18
.
s
[
0
]
fmla
v11
.4
s
,
v28
.4
s
,
v19
.
s
[
0
]
fmla
v10
.4
s
,
v29
.4
s
,
v18
.
s
[
1
]
fmla
v11
.4
s
,
v29
.4
s
,
v19
.
s
[
1
]
fmla
v10
.4
s
,
v30
.4
s
,
v18
.
s
[
2
]
fmla
v11
.4
s
,
v30
.4
s
,
v19
.
s
[
2
]
fmla
v10
.4
s
,
v31
.4
s
,
v18
.
s
[
3
]
fmla
v11
.4
s
,
v31
.4
s
,
v19
.
s
[
3
]
ld1
{
v22
.4
s
},
[
x26
],
x12
ld1
{
v23
.4
s
},
[
x26
],
x12
fmla
v12
.4
s
,
v28
.4
s
,
v20
.
s
[
0
]
fmla
v13
.4
s
,
v28
.4
s
,
v21
.
s
[
0
]
fmla
v12
.4
s
,
v29
.4
s
,
v20
.
s
[
1
]
fmla
v13
.4
s
,
v29
.4
s
,
v21
.
s
[
1
]
fmla
v12
.4
s
,
v30
.4
s
,
v20
.
s
[
2
]
fmla
v13
.4
s
,
v30
.4
s
,
v21
.
s
[
2
]
fmla
v12
.4
s
,
v31
.4
s
,
v20
.
s
[
3
]
fmla
v13
.4
s
,
v31
.4
s
,
v21
.
s
[
3
]
fmla
v14
.4
s
,
v28
.4
s
,
v22
.
s
[
0
]
fmla
v15
.4
s
,
v28
.4
s
,
v23
.
s
[
0
]
fmla
v14
.4
s
,
v29
.4
s
,
v22
.
s
[
1
]
fmla
v15
.4
s
,
v29
.4
s
,
v23
.
s
[
1
]
fmla
v14
.4
s
,
v30
.4
s
,
v22
.
s
[
2
]
fmla
v15
.4
s
,
v30
.4
s
,
v23
.
s
[
2
]
fmla
v14
.4
s
,
v31
.4
s
,
v22
.
s
[
3
]
fmla
v15
.4
s
,
v31
.4
s
,
v23
.
s
[
3
]
add
x21
,
x21
,
#
16
add
x25
,
x25
,
#
16
subs
x27
,
x27
,
#
1
bgt
LoopIc16
subs
x23
,
x23
,
#
1
add
x24
,
x24
,
x14
bne
LoopKw16
add
x20
,
x20
,
x13
subs
x22
,
x22
,
#
1
bne
LoopKh16
ldr
x16
,
[
sp
,
#
64
]
cbnz
x16
,
Relu616
ldr
x26
,
[
sp
,
#
56
]
cbnz
x26
,
Relu16
b
Write16
Relu616
:
fmin
v0
.4
s
,
v0
.4
s
,
v26
.4
s
fmin
v1
.4
s
,
v1
.4
s
,
v26
.4
s
fmin
v2
.4
s
,
v2
.4
s
,
v26
.4
s
fmin
v3
.4
s
,
v3
.4
s
,
v26
.4
s
fmin
v4
.4
s
,
v4
.4
s
,
v26
.4
s
fmin
v5
.4
s
,
v5
.4
s
,
v26
.4
s
fmin
v6
.4
s
,
v6
.4
s
,
v26
.4
s
fmin
v7
.4
s
,
v7
.4
s
,
v26
.4
s
fmin
v8
.4
s
,
v8
.4
s
,
v26
.4
s
fmin
v9
.4
s
,
v9
.4
s
,
v26
.4
s
fmin
v10
.4
s
,
v10
.4
s
,
v26
.4
s
fmin
v11
.4
s
,
v11
.4
s
,
v26
.4
s
fmin
v12
.4
s
,
v12
.4
s
,
v26
.4
s
fmin
v13
.4
s
,
v13
.4
s
,
v26
.4
s
fmin
v14
.4
s
,
v14
.4
s
,
v26
.4
s
fmin
v15
.4
s
,
v15
.4
s
,
v26
.4
s
Relu16
:
fmax
v0
.4
s
,
v0
.4
s
,
v27
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v27
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v27
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v27
.4
s
fmax
v4
.4
s
,
v4
.4
s
,
v27
.4
s
fmax
v5
.4
s
,
v5
.4
s
,
v27
.4
s
fmax
v6
.4
s
,
v6
.4
s
,
v27
.4
s
fmax
v7
.4
s
,
v7
.4
s
,
v27
.4
s
fmax
v8
.4
s
,
v8
.4
s
,
v27
.4
s
fmax
v9
.4
s
,
v9
.4
s
,
v27
.4
s
fmax
v10
.4
s
,
v10
.4
s
,
v27
.4
s
fmax
v11
.4
s
,
v11
.4
s
,
v27
.4
s
fmax
v12
.4
s
,
v12
.4
s
,
v27
.4
s
fmax
v13
.4
s
,
v13
.4
s
,
v27
.4
s
fmax
v14
.4
s
,
v14
.4
s
,
v27
.4
s
fmax
v15
.4
s
,
v15
.4
s
,
v27
.4
s
Write16
:
st1
{
v0
.4
s
},
[
x3
],
x9
st1
{
v1
.4
s
},
[
x3
],
x9
st1
{
v2
.4
s
},
[
x3
],
x9
st1
{
v3
.4
s
},
[
x3
],
x9
st1
{
v4
.4
s
},
[
x3
],
x9
st1
{
v5
.4
s
},
[
x3
],
x9
st1
{
v6
.4
s
},
[
x3
],
x9
st1
{
v7
.4
s
},
[
x3
],
x9
st1
{
v8
.4
s
},
[
x3
],
x9
st1
{
v9
.4
s
},
[
x3
],
x9
st1
{
v10
.4
s
},
[
x3
],
x9
st1
{
v11
.4
s
},
[
x3
],
x9
st1
{
v12
.4
s
},
[
x3
],
x9
st1
{
v13
.4
s
},
[
x3
],
x9
st1
{
v14
.4
s
},
[
x3
],
x9
st1
{
v15
.4
s
},
[
x3
],
x9
add
x17
,
x17
,
x19
sub
x18
,
x18
,
#
16
cmp
x18
,
#
0
ble
LoopWEnd
cmp
x18
,
#
8
blt
LoopW
cmp
x18
,
#
16
bge
LoopW16
LoopW8
:
mov
x19
,
#
8
mul
x19
,
x19
,
x12
mov
x20
,
x17
mov
x21
,
x2
mov
x22
,
x6
mov
v0
.16
b
,
v25
.16
b
mov
v1
.16
b
,
v25
.16
b
mov
v2
.16
b
,
v25
.16
b
mov
v3
.16
b
,
v25
.16
b
mov
v4
.16
b
,
v25
.16
b
mov
v5
.16
b
,
v25
.16
b
mov
v6
.16
b
,
v25
.16
b
mov
v7
.16
b
,
v25
.16
b
LoopKh8
:
mov
x23
,
x7
mov
x24
,
x20
LoopKw8
:
mov
x25
,
x24
mov
x27
,
x10
LoopIc8
:
mov
x26
,
x25
mov
x16
,
x21
ld1
{
v28
.4
s
},
[
x16
],
x15
ld1
{
v29
.4
s
},
[
x16
],
x15
ld1
{
v30
.4
s
},
[
x16
],
x15
ld1
{
v31
.4
s
},
[
x16
],
x15
zip1
v20
.4
s
,
v28
.4
s
,
v29
.4
s
zip2
v21
.4
s
,
v28
.4
s
,
v29
.4
s
zip1
v22
.4
s
,
v30
.4
s
,
v31
.4
s
zip2
v23
.4
s
,
v30
.4
s
,
v31
.4
s
ld1
{
v16
.4
s
},
[
x26
],
x12
ld1
{
v17
.4
s
},
[
x26
],
x12
trn1
v28
.2
d
,
v20
.2
d
,
v22
.2
d
trn2
v29
.2
d
,
v20
.2
d
,
v22
.2
d
trn1
v30
.2
d
,
v21
.2
d
,
v23
.2
d
trn2
v31
.2
d
,
v21
.2
d
,
v23
.2
d
ld1
{
v18
.4
s
},
[
x26
],
x12
ld1
{
v19
.4
s
},
[
x26
],
x12
fmla
v0
.4
s
,
v28
.4
s
,
v16
.
s
[
0
]
fmla
v1
.4
s
,
v28
.4
s
,
v17
.
s
[
0
]
fmla
v0
.4
s
,
v29
.4
s
,
v16
.
s
[
1
]
fmla
v1
.4
s
,
v29
.4
s
,
v17
.
s
[
1
]
fmla
v0
.4
s
,
v30
.4
s
,
v16
.
s
[
2
]
fmla
v1
.4
s
,
v30
.4
s
,
v17
.
s
[
2
]
fmla
v0
.4
s
,
v31
.4
s
,
v16
.
s
[
3
]
fmla
v1
.4
s
,
v31
.4
s
,
v17
.
s
[
3
]
ld1
{
v20
.4
s
},
[
x26
],
x12
ld1
{
v21
.4
s
},
[
x26
],
x12
fmla
v2
.4
s
,
v28
.4
s
,
v18
.
s
[
0
]
fmla
v3
.4
s
,
v28
.4
s
,
v19
.
s
[
0
]
fmla
v2
.4
s
,
v29
.4
s
,
v18
.
s
[
1
]
fmla
v3
.4
s
,
v29
.4
s
,
v19
.
s
[
1
]
fmla
v2
.4
s
,
v30
.4
s
,
v18
.
s
[
2
]
fmla
v3
.4
s
,
v30
.4
s
,
v19
.
s
[
2
]
fmla
v2
.4
s
,
v31
.4
s
,
v18
.
s
[
3
]
fmla
v3
.4
s
,
v31
.4
s
,
v19
.
s
[
3
]
ld1
{
v22
.4
s
},
[
x26
],
x12
ld1
{
v23
.4
s
},
[
x26
],
x12
fmla
v4
.4
s
,
v28
.4
s
,
v20
.
s
[
0
]
fmla
v5
.4
s
,
v28
.4
s
,
v21
.
s
[
0
]
fmla
v4
.4
s
,
v29
.4
s
,
v20
.
s
[
1
]
fmla
v5
.4
s
,
v29
.4
s
,
v21
.
s
[
1
]
fmla
v4
.4
s
,
v30
.4
s
,
v20
.
s
[
2
]
fmla
v5
.4
s
,
v30
.4
s
,
v21
.
s
[
2
]
fmla
v4
.4
s
,
v31
.4
s
,
v20
.
s
[
3
]
fmla
v5
.4
s
,
v31
.4
s
,
v21
.
s
[
3
]
fmla
v6
.4
s
,
v28
.4
s
,
v22
.
s
[
0
]
fmla
v7
.4
s
,
v28
.4
s
,
v23
.
s
[
0
]
fmla
v6
.4
s
,
v29
.4
s
,
v22
.
s
[
1
]
fmla
v7
.4
s
,
v29
.4
s
,
v23
.
s
[
1
]
fmla
v6
.4
s
,
v30
.4
s
,
v22
.
s
[
2
]
fmla
v7
.4
s
,
v30
.4
s
,
v23
.
s
[
2
]
fmla
v6
.4
s
,
v31
.4
s
,
v22
.
s
[
3
]
fmla
v7
.4
s
,
v31
.4
s
,
v23
.
s
[
3
]
add
x21
,
x21
,
#
16
add
x25
,
x25
,
#
16
subs
x27
,
x27
,
#
1
bgt
LoopIc8
subs
x23
,
x23
,
#
1
add
x24
,
x24
,
x14
bne
LoopKw8
add
x20
,
x20
,
x13
subs
x22
,
x22
,
#
1
bne
LoopKh8
ldr
x16
,
[
sp
,
#
64
]
cbnz
x16
,
Relu68
ldr
x26
,
[
sp
,
#
56
]
cbnz
x26
,
Relu8
b
Write8
Relu68
:
fmin
v0
.4
s
,
v0
.4
s
,
v26
.4
s
fmin
v1
.4
s
,
v1
.4
s
,
v26
.4
s
fmin
v2
.4
s
,
v2
.4
s
,
v26
.4
s
fmin
v3
.4
s
,
v3
.4
s
,
v26
.4
s
fmin
v4
.4
s
,
v4
.4
s
,
v26
.4
s
fmin
v5
.4
s
,
v5
.4
s
,
v26
.4
s
fmin
v6
.4
s
,
v6
.4
s
,
v26
.4
s
fmin
v7
.4
s
,
v7
.4
s
,
v26
.4
s
Relu8
:
fmax
v0
.4
s
,
v0
.4
s
,
v27
.4
s
fmax
v1
.4
s
,
v1
.4
s
,
v27
.4
s
fmax
v2
.4
s
,
v2
.4
s
,
v27
.4
s
fmax
v3
.4
s
,
v3
.4
s
,
v27
.4
s
fmax
v4
.4
s
,
v4
.4
s
,
v27
.4
s
fmax
v5
.4
s
,
v5
.4
s
,
v27
.4
s
fmax
v6
.4
s
,
v6
.4
s
,
v27
.4
s
fmax
v7
.4
s
,
v7
.4
s
,
v27
.4
s
Write8
:
st1
{
v0
.4
s
},
[
x3
],
x9
st1
{
v1
.4
s
},
[
x3
],
x9
st1
{
v2
.4
s
},
[
x3
],
x9
st1
{
v3
.4
s
},
[
x3
],
x9
st1
{
v4
.4
s
},
[
x3
],
x9
st1
{
v5
.4
s
},
[
x3
],
x9
st1
{
v6
.4
s
},
[
x3
],
x9
st1
{
v7
.4
s
},
[
x3
],
x9
add
x17
,
x17
,
x19
sub
x18
,
x18
,
#
8
cmp
x18
,
#
0
ble
LoopWEnd
cmp
x18
,
#
8
bge
LoopW8
LoopW
:
mov
x20
,
x17
mov
x21
,
x2
mov
x22
,
x6
mov
v0
.16
b
,
v25
.16
b
LoopKh
:
mov
x23
,
x7
mov
x24
,
x20
LoopKw
:
mov
x25
,
x24
mov
x27
,
x10
LoopIc
:
mov
x26
,
x25
mov
x16
,
x21
ld1
{
v28
.4
s
},
[
x16
],
x15
ld1
{
v29
.4
s
},
[
x16
],
x15
ld1
{
v30
.4
s
},
[
x16
],
x15
ld1
{
v31
.4
s
},
[
x16
],
x15
zip1
v20
.4
s
,
v28
.4
s
,
v29
.4
s
zip2
v21
.4
s
,
v28
.4
s
,
v29
.4
s
zip1
v22
.4
s
,
v30
.4
s
,
v31
.4
s
zip2
v23
.4
s
,
v30
.4
s
,
v31
.4
s
ld1
{
v16
.4
s
},
[
x26
],
x12
trn1
v28
.2
d
,
v20
.2
d
,
v22
.2
d
trn2
v29
.2
d
,
v20
.2
d
,
v22
.2
d
trn1
v30
.2
d
,
v21
.2
d
,
v23
.2
d
trn2
v31
.2
d
,
v21
.2
d
,
v23
.2
d
fmla
v0
.4
s
,
v28
.4
s
,
v16
.
s
[
0
]
fmla
v0
.4
s
,
v29
.4
s
,
v16
.
s
[
1
]
fmla
v0
.4
s
,
v30
.4
s
,
v16
.
s
[
2
]
fmla
v0
.4
s
,
v31
.4
s
,
v16
.
s
[
3
]
add
x21
,
x21
,
#
16
add
x25
,
x25
,
#
16
subs
x27
,
x27
,
#
1
bgt
LoopIc
subs
x23
,
x23
,
#
1
add
x24
,
x24
,
x14
bne
LoopKw
add
x20
,
x20
,
x13
subs
x22
,
x22
,
#
1
bne
LoopKh
ldr
x16
,
[
sp
,
#
64
]
cbnz
x16
,
Relu6
ldr
x26
,
[
sp
,
#
56
]
cbnz
x26
,
Relu
b
Write
Relu6
:
fmin
v0
.4
s
,
v0
.4
s
,
v26
.4
s
Relu
:
fmax
v0
.4
s
,
v0
.4
s
,
v27
.4
s
Write
:
st1
{
v0
.4
s
},
[
x3
],
x9
add
x17
,
x17
,
x12
subs
x18
,
x18
,
#
1
bne
LoopW
LoopWEnd
:
add
x0
,
x0
,
x8
add
x1
,
x1
,
x11
subs
x4
,
x4
,
#
1
bne
LoopH
sub
sp
,
sp
,
#
208
ld1
{
v8
.4
s
,
v9
.4
s
,
v10
.4
s
,
v11
.4
s
},
[
sp
],
#
64
ld1
{
v12
.4
s
,
v13
.4
s
,
v14
.4
s
,
v15
.4
s
},
[
sp
],
#
64
ldp
x19
,
x20
,
[
sp
],
#
16
ldp
x21
,
x22
,
[
sp
],
#
16
ldp
x23
,
x24
,
[
sp
],
#
16
ldp
x25
,
x26
,
[
sp
],
#
16
ldp
x27
,
x28
,
[
sp
],
#
16
ret
#endif
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/common_func.h
浏览文件 @
0ec5a570
...
...
@@ -71,6 +71,11 @@ void DeconvDwFp32Border(float *dst, const float *src, const float *weight, size_
void
PostFuncBiasReluC8
(
float
*
dst
,
const
float
*
src
,
const
float
*
bias
,
size_t
oc8div
,
size_t
oc8mod
,
size_t
plane_size
,
size_t
stride
,
size_t
relu_type
);
void
ConvSwFp32Center
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
size_t
height
,
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
ic4
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
relu
,
size_t
relu6
);
#endif
#ifdef __cplusplus
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv.c
浏览文件 @
0ec5a570
...
...
@@ -16,6 +16,7 @@
#include "nnacl/fp32/conv.h"
#include <string.h>
#include "nnacl/fp32/common_func.h"
#include "nnacl/winograd_transform.h"
void
SWBorderPixel
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
int
height
,
int
width
,
...
...
@@ -83,6 +84,7 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
}
// height loop
}
#ifndef ENABLE_ARM64
void
SWCenter
(
float
*
dst
,
const
float
*
src
,
const
float
*
weight
,
const
float
*
bias
,
int
height
,
int
width
,
int
kernel_h
,
int
kernel_w
,
int
out_h_step
,
int
block_channel
,
int
ic4
,
int
in_sh_step
,
int
in_sw_step
,
int
in_kh_step
,
int
in_kw_step
,
bool
is_relu
,
bool
is_relu6
)
{
...
...
@@ -135,6 +137,7 @@ void SWCenter(float *dst, const float *src, const float *weight, const float *bi
src_h
+=
in_sh_step
;
}
// dst_height loop
}
#endif
// fp32 sliding window
void
ConvSWFp32
(
const
float
*
input_data
,
const
float
*
packed_weight
,
const
float
*
bias_data
,
float
*
tmp_out_block
,
...
...
@@ -172,11 +175,23 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
src_data
+
in_h_start
*
slidingWindow_param
->
in_h_step_
+
in_w_start
*
slidingWindow_param
->
ic4_channel_
;
float
*
out_t
=
dst_data
+
slidingWindow_param
->
top_
*
slidingWindow_param
->
out_h_step_
+
slidingWindow_param
->
left_
*
slidingWindow_param
->
block_channel_
;
#ifdef ENABLE_ARM64
ConvSwFp32Center
(
out_t
,
in_t
,
weight
,
bias
,
slidingWindow_param
->
bottom_
-
slidingWindow_param
->
top_
,
slidingWindow_param
->
right_
-
slidingWindow_param
->
left_
,
conv_param
->
kernel_h_
,
conv_param
->
kernel_w_
,
slidingWindow_param
->
out_h_step_
*
sizeof
(
float
),
slidingWindow_param
->
block_channel_
*
sizeof
(
float
),
ic4
,
slidingWindow_param
->
in_sh_step_
*
sizeof
(
float
),
slidingWindow_param
->
in_sw_step_
*
sizeof
(
float
),
slidingWindow_param
->
in_kh_step_
*
sizeof
(
float
),
slidingWindow_param
->
in_kw_step_
*
sizeof
(
float
),
conv_param
->
is_relu_
,
conv_param
->
is_relu6_
);
#else
SWCenter
(
out_t
,
in_t
,
weight
,
bias
,
slidingWindow_param
->
bottom_
-
slidingWindow_param
->
top_
,
slidingWindow_param
->
right_
-
slidingWindow_param
->
left_
,
conv_param
->
kernel_h_
,
conv_param
->
kernel_w_
,
slidingWindow_param
->
out_h_step_
,
slidingWindow_param
->
block_channel_
,
ic4
,
slidingWindow_param
->
right_
-
slidingWindow_param
->
left_
,
conv_param
->
kernel_h_
,
conv_param
->
kernel_w_
,
slidingWindow_param
->
out_h_step_
,
slidingWindow_param
->
block_channel_
,
ic4
,
slidingWindow_param
->
in_sh_step_
,
slidingWindow_param
->
in_sw_step_
,
slidingWindow_param
->
in_kh_step_
,
slidingWindow_param
->
in_kw_step_
,
conv_param
->
is_relu_
,
conv_param
->
is_relu6_
);
#endif
}
}
// output C4 loop
src
+=
slidingWindow_param
->
in_step_
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录