Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
正统之独孤求败
mindspore
提交
5802b1f8
M
mindspore
项目概览
正统之独孤求败
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
5802b1f8
编写于
8月 26, 2020
作者:
Y
yangruoqi713
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[MS][LITE] arm cpu fp16 op: optimize conv depthwise
上级
8b5c3521
变更
11
显示空白变更内容
内联
并排
Showing
11 changed files
with
496 additions
and
140 deletions
+496
-140
mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
+117
-0
mindspore/lite/nnacl/fp16/common_func.h
mindspore/lite/nnacl/fp16/common_func.h
+0
-49
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
+56
-2
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
+20
-0
mindspore/lite/nnacl/fp16/pack_fp16.c
mindspore/lite/nnacl/fp16/pack_fp16.c
+13
-0
mindspore/lite/nnacl/fp16/pack_fp16.h
mindspore/lite/nnacl/fp16/pack_fp16.h
+2
-0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
...src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+21
-78
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
.../src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+2
-10
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
...kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+203
-0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
.../kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
+62
-0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
.../src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+0
-1
未找到文件。
mindspore/lite/nnacl/assembly/opt/ConvDwFp16Row.S
0 → 100644
浏览文件 @
5802b1f8
#ifdef __aarch64__
    .text
    .align 5
    .global ConvDwFp16Row
#ifndef __APPLE__
    .type ConvDwFp16Row, %function
#endif

// void ConvDwFp16Row(float16_t* output_ptr, const float16_t* input_ptr, const float16_t* filter_ptr,
//                    size_t num_pixels, size_t input_channel, size_t input_step)
// x0: output_ptr, x1: input_ptr, x2: filter_ptr, x3: num_pixels,
// x4: input_channel, x5: input_step
//
// Fused multiply-accumulate of one depthwise-convolution row:
// for each of num_pixels output pixels, output_ptr[c] += input_ptr[c] * filter_ptr[c]
// over input_channel fp16 lanes, advancing the input by input_step elements per pixel.
// Channels are processed in chunks of 32, then 8, then a scalar tail of 1.
ConvDwFp16Row:
// registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
    // Nothing to do for zero pixels.
    cmp x3, #0
    beq End

    mov x9, x0                  // x9: write cursor (x0 stays the read-back cursor of the accumulator)
    mov x12, #2                 // sizeof(float16_t)
    mul x5, x5, x12             // input_step: elements -> bytes

LoopOutPixel:
    // Per-pixel cursors: x6 = input, x7 = filter, x8 = remaining channels.
    mov x6, x1
    mov x7, x2
    mov x8, x4

LoopInputDepth32In:
    cmp x8, #32
    blt Loop8
    sub x8, x8, #32

    // Software-pipelined prologue: preload first 16 input/filter/accumulator lanes.
    ld1 {v0.8h, v1.8h}, [x6], #32
    ld1 {v2.8h, v3.8h}, [x7], #32
    ld1 {v16.8h, v17.8h}, [x0], #32

    cmp x8, #32
    blt LoopInputDepth32Out
// Steady state: two 16-lane fmla/store groups per iteration (32 channels),
// with the next iteration's first group preloaded before the loop test.
LoopInputDepth32:
    fmla v16.8h, v0.8h, v2.8h
    fmla v17.8h, v1.8h, v3.8h

    st1 {v16.8h, v17.8h}, [x9], #32

    ld1 {v4.8h, v5.8h}, [x6], #32
    ld1 {v6.8h, v7.8h}, [x7], #32
    ld1 {v18.8h, v19.8h}, [x0], #32

    fmla v18.8h, v4.8h, v6.8h
    fmla v19.8h, v5.8h, v7.8h

    st1 {v18.8h, v19.8h}, [x9], #32

    ld1 {v0.8h, v1.8h}, [x6], #32
    ld1 {v2.8h, v3.8h}, [x7], #32
    ld1 {v16.8h, v17.8h}, [x0], #32

    sub x8, x8, #32
    cmp x8, #32
    bge LoopInputDepth32

// Epilogue: drain the already-loaded 32 channels.
LoopInputDepth32Out:
    fmla v16.8h, v0.8h, v2.8h
    fmla v17.8h, v1.8h, v3.8h
    st1 {v16.8h, v17.8h}, [x9], #32

    ld1 {v4.8h, v5.8h}, [x6], #32
    ld1 {v6.8h, v7.8h}, [x7], #32
    ld1 {v18.8h, v19.8h}, [x0], #32

    fmla v18.8h, v4.8h, v6.8h
    fmla v19.8h, v5.8h, v7.8h

    st1 {v18.8h, v19.8h}, [x9], #32

// 8-channel vector tail.
Loop8:
    cmp x8, #8
    blt L0

LoopInputDepth8:
    ld1 {v0.8h}, [x6], #16
    ld1 {v2.8h}, [x7], #16
    ld1 {v16.8h}, [x0], #16
    fmla v16.8h, v0.8h, v2.8h
    st1 {v16.8h}, [x9], #16
    sub x8, x8, #8
    cmp x8, #8
    bge LoopInputDepth8

// Scalar tail (fewer than 8 channels left).
L0:
    cmp x8, #0
    beq Loop8LineEnd

LoopInputDepth0:
    ldr h0, [x6], #2
    ldr h1, [x7], #2
    ldr h2, [x0], #2
    fmul h0, h0, h1
    fadd h2, h2, h0
    str h2, [x9], #2
    subs x8, x8, #1
    bne LoopInputDepth0

Loop8LineEnd:
    // Next output pixel: advance the input base by input_step bytes.
    subs x3, x3, #1
    add x1, x1, x5
    bne LoopOutPixel

End:
    ret
#endif
mindspore/lite/nnacl/fp16/common_func.h
已删除
100644 → 0
浏览文件 @
8b5c3521
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_
#define MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "nnacl/op_base.h"
#include "nnacl/conv_parameter.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Declarations of ARM64 assembly kernels for fp16 (de)convolution-depthwise.
 * `relu` / `relu6` act as boolean flags selecting the fused activation.
 * The *_step / *_kh_step / *_kw_step parameters are byte strides into the
 * packed input — presumably matching the packing used by the callers; verify
 * against the corresponding .S implementations. */
#ifdef ENABLE_ARM64
/* Accumulate one border region of a depthwise convolution output. */
void ConvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                      size_t height, size_t width, size_t in_kh_step, size_t in_kw_step, size_t kernel_w, size_t relu,
                      size_t relu6);
/* Compute the interior (no padding needed) region of a depthwise convolution. */
void ConvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                      size_t height, size_t width, size_t kernel_h, size_t kernel_w, size_t out_h_step,
                      size_t block_channel, size_t in_sh_step, size_t in_sw_step, size_t in_kh_step, size_t in_kw_step,
                      size_t relu, size_t relu6);
/* Border region of a depthwise transposed (deconv) convolution; no bias/activation. */
void DeconvDwFp16Border(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
                        size_t in_kh_step, size_t in_kw_step, size_t kernel_w);
/* Interior region of a depthwise transposed (deconv) convolution. */
void DeconvDwFp16Center(float16_t *dst, const float16_t *src, const float16_t *weight, size_t height, size_t width,
                        size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step,
                        size_t in_sw_step, size_t in_kh_step, size_t in_kw_step);
#endif

#ifdef __cplusplus
}
#endif
#endif /* MINDSPORE_LITE_NNACL_FP16_COMMON_FUNC_H_ */
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
浏览文件 @
5802b1f8
...
@@ -15,8 +15,62 @@
...
@@ -15,8 +15,62 @@
*/
*/
#include "nnacl/fp16/conv_depthwise_fp16.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"
#include <arm_neon.h>
#include <string.h>
#include "nnacl/fp16/common_func.h"
#include "nnacl/fp16/activation_fp16.h"
/* Depthwise fp16 convolution over NHWC data.
 * Each worker thread computes the horizontal strip of output rows
 * [h_start, h_end) derived from task_id; rows are bias-initialized and then
 * accumulated kernel-row by kernel-row via the ConvDwFp16Row assembly kernel.
 * Assumes input_channel_ == output_channel_ (depthwise).
 *
 * output_data/input_data: NHWC activations.
 * weight_data: packed (kh, kw, channel) filter.
 * bias_data: per-channel bias, length output_channel_.
 */
void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                const float16_t *bias_data, const ConvParameter *conv_param, int task_id) {
  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
  int h_start = h_step * task_id;
  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    const float16_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
    float16_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
    for (int oh = h_start; oh < h_end; oh++) {
      float16_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;

      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
      /* Clip the kernel vertically so every touched input row is in range. */
      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));

      /* Seed the whole output row with the bias before accumulating taps. */
      for (int ow = 0; ow < conv_param->output_w_; ow++) {
        memcpy(dst_data + ow * conv_param->output_channel_, bias_data,
               conv_param->output_channel_ * sizeof(float16_t));
      }
      for (int kh = start_kh; kh < end_kh; kh++) {
        /* BUG FIX: kh indexes the kernel *height*, so the input row offset must
         * scale by dilation_h_ (the original code used dilation_w_, producing
         * wrong results whenever dilation_h_ != dilation_w_). */
        int ih = ih_origin + conv_param->dilation_h_ * kh;

        const float16_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
        const float16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;

        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
          /* Horizontal clipping: output columns for which this tap reads
           * an in-bounds input column. */
          int out_w_start = MSMAX(
            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                         conv_param->stride_w_);

          float16_t *dst_w = dst_data + out_w_start * conv_param->output_channel_;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;

          const float16_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
          int num_pixels = out_w_end - out_w_start;

          /* dst_w[p][c] += src_kw[p*stride][c] * weight_kh[c] for num_pixels pixels. */
          ConvDwFp16Row(dst_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step);
          weight_kh += conv_param->output_channel_;
        }
      }
      /* Fused activation over the finished output row. */
      if (relu) {
        ReluFp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
      if (relu6) {
        Relu6Fp16(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
    }
  }
}
/*conv depthwise fp16 begin*/
/*conv depthwise fp16 begin*/
void
DepthwiseBorderPixelFp16
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
const
float16_t
*
bias
,
void
DepthwiseBorderPixelFp16
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
const
float16_t
*
bias
,
...
...
mindspore/lite/nnacl/fp16/conv_depthwise_fp16.h
浏览文件 @
5802b1f8
...
@@ -23,6 +23,26 @@
...
@@ -23,6 +23,26 @@
#ifdef __cplusplus
#ifdef __cplusplus
extern
"C"
{
extern
"C"
{
#endif
#endif
#ifdef ENABLE_ARM64
void
ConvDwFp16Row
(
float16_t
*
output_ptr
,
const
float16_t
*
input_ptr
,
const
float16_t
*
filter_ptr
,
size_t
num_pixels
,
size_t
input_channel
,
size_t
input_step
);
void
ConvDwFp16Border
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
const
float16_t
*
bias
,
size_t
height
,
size_t
width
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
kernel_w
,
size_t
relu
,
size_t
relu6
);
void
ConvDwFp16Center
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
const
float16_t
*
bias
,
size_t
height
,
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
relu
,
size_t
relu6
);
void
DeconvDwFp16Border
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
size_t
height
,
size_t
width
,
size_t
in_kh_step
,
size_t
in_kw_step
,
size_t
kernel_w
);
void
DeconvDwFp16Center
(
float16_t
*
dst
,
const
float16_t
*
src
,
const
float16_t
*
weight
,
size_t
height
,
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
);
#endif
void
ConvDwFp16
(
float16_t
*
output_data
,
const
float16_t
*
input_data
,
const
float16_t
*
weight_data
,
const
float16_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
int
task_id
);
void
ConvDwC8Fp16
(
float16_t
*
output_data
,
const
float16_t
*
input_data
,
const
float16_t
*
weight_data
,
void
ConvDwC8Fp16
(
float16_t
*
output_data
,
const
float16_t
*
input_data
,
const
float16_t
*
weight_data
,
const
float16_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
const
float16_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
int
task_id
);
int
task_id
);
...
...
mindspore/lite/nnacl/fp16/pack_fp16.c
浏览文件 @
5802b1f8
...
@@ -220,6 +220,19 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
...
@@ -220,6 +220,19 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
}
}
}
}
void
PackNCHWToNHWCFp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
)
{
for
(
int
n
=
0
;
n
<
batch
;
n
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
for
(
int
hw
=
0
;
hw
<
plane
;
hw
++
)
{
int
nhwc_index
=
n
*
channel
*
plane
+
hw
*
channel
+
c
;
int
nchw_index
=
n
*
channel
*
plane
+
c
*
plane
+
hw
;
((
float16_t
*
)(
dst
))[
nhwc_index
]
=
((
const
float16_t
*
)(
src
))[
nchw_index
];
}
}
}
return
;
}
void
PackNHWCToNHWC4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
)
{
void
PackNHWCToNHWC4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
)
{
int
ic4
=
UP_DIV
(
channel
,
C4NUM
);
int
ic4
=
UP_DIV
(
channel
,
C4NUM
);
int
nhwc4_batch_unit_offset
=
ic4
*
C4NUM
*
plane
;
int
nhwc4_batch_unit_offset
=
ic4
*
C4NUM
*
plane
;
...
...
mindspore/lite/nnacl/fp16/pack_fp16.h
浏览文件 @
5802b1f8
...
@@ -41,6 +41,8 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
...
@@ -41,6 +41,8 @@ void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
void
PackNCHWToNC4HW4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNCHWToNC4HW4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNCHWToNHWCFp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToNHWC4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToNHWC4Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToNHWC8Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
void
PackNHWCToNHWC8Fp16
(
const
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
);
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
浏览文件 @
5802b1f8
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
*/
*/
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "schema/model_generated.h"
#include "schema/model_generated.h"
...
@@ -30,72 +31,34 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
...
@@ -30,72 +31,34 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace
mindspore
::
kernel
{
namespace
mindspore
::
kernel
{
ConvolutionDepthwiseFp16CPUKernel
::~
ConvolutionDepthwiseFp16CPUKernel
()
{
ConvolutionDepthwiseFp16CPUKernel
::~
ConvolutionDepthwiseFp16CPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
delete
packed_weight_
;
packed_weight_
=
nullptr
;
packed_weight_
=
nullptr
;
}
}
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseFp16CPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
}
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
InitBuffer
()
{
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
C8
=
UP_DIV
(
conv_param_
->
input_channel_
,
C8NUM
);
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C8NUM
*
C8
;
packed_input_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
float16_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C8NUM
*
C8
;
packed_output_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_output_size
*
sizeof
(
float16_t
)));
if
(
packed_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
}
return
RET_OK
;
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
InitWeightBias
()
{
int
ConvolutionDepthwiseFp16CPUKernel
::
InitWeightBias
()
{
// init weight: o, h, w, i; o == group, i == 1
// init weight: o, h, w, i; o == group, i == 1
ConvolutionBaseFP16CPUKernel
::
GetExecuteFilter
();
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
int
OC8
=
UP_DIV
(
weight_tensor
->
Batch
(),
C8NUM
);
int
channel
=
weight_tensor
->
Batch
();
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
pack_weight_size
=
channel
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
int
pack_weight_size
=
C8NUM
*
OC8
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
packed_weight_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float16_t
)));
packed_weight_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float16_t
)));
if
(
packed_weight_
==
nullptr
)
{
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
return
RET_ERROR
;
}
}
PackNCHW
Fp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
PackNCHW
ToNHWCFp16
(
fp16_weight_
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
weight_tensor
->
Batch
());
bias_data_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
C8NUM
*
OC8
*
sizeof
(
float16_t
)));
bias_data_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
channel
*
sizeof
(
float16_t
)));
if
(
bias_data_
==
nullptr
)
{
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
return
RET_ERROR
;
}
}
memset
(
bias_data_
,
0
,
C8NUM
*
OC8
*
sizeof
(
float16_t
));
memset
(
bias_data_
,
0
,
channel
*
sizeof
(
float16_t
));
auto
bias_fp16
=
reinterpret_cast
<
float16_t
*>
(
bias_data_
);
auto
bias_fp16
=
reinterpret_cast
<
float16_t
*>
(
bias_data_
);
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
...
@@ -104,18 +67,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
...
@@ -104,18 +67,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
bias_fp16
[
i
]
=
(
float16_t
)
ori_bias
[
i
];
bias_fp16
[
i
]
=
(
float16_t
)
ori_bias
[
i
];
}
}
}
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC8
);
return
RET_OK
;
return
RET_OK
;
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
Init
()
{
int
ConvolutionDepthwiseFp16CPUKernel
::
Init
()
{
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param failed."
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitWeightBias failed."
;
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitWeightBias failed."
;
...
@@ -129,24 +84,17 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
...
@@ -129,24 +84,17 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
ReSize
()
{
int
ConvolutionDepthwiseFp16CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
auto
ret
=
ConvolutionBaseCPUKernel
::
Init
();
auto
ret
=
ConvolutionBaseCPUKernel
::
Init
();
if
(
ret
!=
RET_OK
)
{
if
(
ret
!=
RET_OK
)
{
return
ret
;
return
ret
;
}
}
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C8NUM
);
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
conv_param_
->
output_h_
);
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitBuffer failed."
;
return
RET_ERROR
;
}
return
RET_OK
;
return
RET_OK
;
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
Execute
(
int
task_id
)
{
int
ConvolutionDepthwiseFp16CPUKernel
::
Execute
(
int
task_id
)
{
ConvDw
C8Fp16
(
packed_output_
,
packed
_input_
,
packed_weight_
,
reinterpret_cast
<
float16_t
*>
(
bias_data_
),
conv_param_
,
ConvDw
Fp16
(
execute_output_
,
execute
_input_
,
packed_weight_
,
reinterpret_cast
<
float16_t
*>
(
bias_data_
),
conv_param_
,
sliding_
,
task_id
);
task_id
);
return
RET_OK
;
return
RET_OK
;
}
}
...
@@ -176,25 +124,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
...
@@ -176,25 +124,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
MS_LOG
(
ERROR
)
<<
"Get Execute tensor failed."
;
MS_LOG
(
ERROR
)
<<
"Get Execute tensor failed."
;
return
ret
;
return
ret
;
}
}
if
(
need_align_
)
{
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
}
else
{
packed_input_
=
execute_input_
;
}
if
(
!
need_align_
)
{
packed_output_
=
execute_output_
;
}
ret
=
LiteBackendParallelLaunch
(
ConvDwFp16Run
,
this
,
conv_param_
->
thread_num_
);
ret
=
LiteBackendParallelLaunch
(
ConvDwFp16Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvDwFp16Run error: error_code["
<<
ret
<<
"]"
;
MS_LOG
(
ERROR
)
<<
"ConvDwFp16Run error: error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
return
RET_ERROR
;
}
}
if
(
need_align_
)
{
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
}
ConvolutionBaseFP16CPUKernel
::
IfCastOutput
();
ConvolutionBaseFP16CPUKernel
::
IfCastOutput
();
ConvolutionBaseFP16CPUKernel
::
FreeTmpBuffer
();
ConvolutionBaseFP16CPUKernel
::
FreeTmpBuffer
();
return
RET_OK
;
return
RET_OK
;
...
@@ -207,7 +143,14 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::T
...
@@ -207,7 +143,14 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::tensor::T
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
MS_ASSERT
(
opParameter
!=
nullptr
);
MS_ASSERT
(
opParameter
!=
nullptr
);
MS_ASSERT
(
desc
.
type
==
schema
::
PrimitiveType_DepthwiseConv2D
);
MS_ASSERT
(
desc
.
type
==
schema
::
PrimitiveType_DepthwiseConv2D
);
auto
kernel
=
new
(
std
::
nothrow
)
ConvolutionDepthwiseFp16CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
auto
conv_param
=
reinterpret_cast
<
ConvParameter
*>
(
opParameter
);
kernel
::
LiteKernel
*
kernel
;
if
(
conv_param
->
input_channel_
<
32
)
{
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionDepthwiseSWFp16CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
else
{
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionDepthwiseFp16CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
if
(
kernel
==
nullptr
)
{
if
(
kernel
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"kernel is nullptr."
;
MS_LOG
(
ERROR
)
<<
"kernel is nullptr."
;
return
nullptr
;
return
nullptr
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
浏览文件 @
5802b1f8
...
@@ -25,14 +25,12 @@
...
@@ -25,14 +25,12 @@
#ifdef __cplusplus
#ifdef __cplusplus
extern
"C"
{
extern
"C"
{
#endif
#endif
void
ConvDwC8Fp16
(
float16_t
*
output_data
,
const
float16_t
*
input_data
,
const
float16_t
*
weight_data
,
void
ConvDwFp16
(
float16_t
*
output_data
,
const
float16_t
*
input_data
,
const
float16_t
*
weight_data
,
const
float16_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
const
float16_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
int
task_id
);
int
task_id
);
#ifdef __cplusplus
#ifdef __cplusplus
}
}
#endif
#endif
namespace
mindspore
::
kernel
{
namespace
mindspore
::
kernel
{
class
ConvolutionDepthwiseFp16CPUKernel
:
public
ConvolutionBaseFP16CPUKernel
{
class
ConvolutionDepthwiseFp16CPUKernel
:
public
ConvolutionBaseFP16CPUKernel
{
public:
public:
...
@@ -46,17 +44,11 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
...
@@ -46,17 +44,11 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int
ReSize
()
override
;
int
ReSize
()
override
;
int
Run
()
override
;
int
Run
()
override
;
int
InitBuffer
();
int
InitWeightBias
();
int
InitWeightBias
();
int
Execute
(
int
task_id
);
int
Execute
(
int
task_id
);
private:
private:
void
FreeTmpBuffer
();
SlidingWindowParam
*
sliding_
=
nullptr
;
float16_t
*
packed_weight_
=
nullptr
;
float16_t
*
packed_weight_
=
nullptr
;
float16_t
*
packed_input_
=
nullptr
;
float16_t
*
packed_output_
=
nullptr
;
bool
need_align_
=
false
;
};
};
}
// namespace mindspore::kernel
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
0 → 100644
浏览文件 @
5802b1f8
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kCPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
lite
::
RET_ERROR
;
using
mindspore
::
lite
::
RET_OK
;
using
mindspore
::
schema
::
PrimitiveType_DepthwiseConv2D
;
namespace
mindspore
::
kernel
{
ConvolutionDepthwiseSWFp16CPUKernel
::~
ConvolutionDepthwiseSWFp16CPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseSWFp16CPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
}
}
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
InitBuffer
()
{
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
C8
=
UP_DIV
(
conv_param_
->
input_channel_
,
C8NUM
);
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C8NUM
*
C8
;
packed_input_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
float16_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C8NUM
*
C8
;
packed_output_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_output_size
*
sizeof
(
float16_t
)));
if
(
packed_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
}
return
RET_OK
;
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
InitWeightBias
()
{
// init weight: o, h, w, i; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
int
OC8
=
UP_DIV
(
weight_tensor
->
Batch
(),
C8NUM
);
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
pack_weight_size
=
C8NUM
*
OC8
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
packed_weight_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
PackNCHWFp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
bias_data_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
C8NUM
*
OC8
*
sizeof
(
float16_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
bias_data_
,
0
,
C8NUM
*
OC8
*
sizeof
(
float16_t
));
auto
bias_fp16
=
reinterpret_cast
<
float16_t
*>
(
bias_data_
);
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
i
=
0
;
i
<
bias_tensor
->
ElementsNum
();
i
++
)
{
bias_fp16
[
i
]
=
(
float16_t
)
ori_bias
[
i
];
}
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC8
);
return
RET_OK
;
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
Init
()
{
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param failed."
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitWeightBias failed."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
auto
ret
=
ConvolutionBaseCPUKernel
::
Init
();
if
(
ret
!=
RET_OK
)
{
return
ret
;
}
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C8NUM
);
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitBuffer failed."
;
return
RET_ERROR
;
}
return
RET_OK
;
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
Execute
(
int
task_id
)
{
ConvDwC8Fp16
(
packed_output_
,
packed_input_
,
packed_weight_
,
reinterpret_cast
<
float16_t
*>
(
bias_data_
),
conv_param_
,
sliding_
,
task_id
);
return
RET_OK
;
}
static
int
ConvDwSWFp16Run
(
int
task_id
,
LiteParallelGroupEnv
*
penv
,
void
*
cdata
)
{
auto
conv_dw_fp16
=
reinterpret_cast
<
ConvolutionDepthwiseSWFp16CPUKernel
*>
(
cdata
);
auto
ret
=
conv_dw_fp16
->
Execute
(
task_id
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvolutionDepthwiseSWFp16Run error task_id["
<<
task_id
<<
"] error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
return
RET_OK
;
}
int
ConvolutionDepthwiseSWFp16CPUKernel
::
Run
()
{
auto
ret
=
Prepare
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Prepare failed."
;
return
RET_ERROR
;
}
if
(
conv_param_
->
input_channel_
!=
conv_param_
->
output_channel_
)
{
MS_LOG
(
ERROR
)
<<
"Only support input channel equals output channel."
;
return
RET_ERROR
;
}
ret
=
ConvolutionBaseFP16CPUKernel
::
GetExecuteTensor
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Get Execute tensor failed."
;
return
ret
;
}
if
(
need_align_
)
{
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
}
else
{
packed_input_
=
execute_input_
;
}
if
(
!
need_align_
)
{
packed_output_
=
execute_output_
;
}
ret
=
LiteBackendParallelLaunch
(
ConvDwSWFp16Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvDwSWFp16Run error: error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
if
(
need_align_
)
{
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
}
ConvolutionBaseFP16CPUKernel
::
IfCastOutput
();
ConvolutionBaseFP16CPUKernel
::
FreeTmpBuffer
();
return
RET_OK
;
}
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
0 → 100644
浏览文件 @
5802b1f8
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"
#ifdef __cplusplus
extern "C" {
#endif
/* C8-packed sliding-window depthwise convolution, fp16 (implemented in nnacl).
 * sliding carries the precomputed window geometry; task_id selects the
 * thread's slice of the output. */
void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id);
#ifdef __cplusplus
}
#endif

namespace mindspore::kernel {
// CPU kernel wrapping the sliding-window fp16 depthwise convolution.
// Chosen (by the creator in convolution_depthwise_fp16.cc) for small channel
// counts; packs activations to C8 layout when the channel count needs padding.
class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 public:
  ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                      const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~ConvolutionDepthwiseSWFp16CPUKernel() override;

  int Init() override;
  int ReSize() override;
  int Run() override;

  int InitBuffer();       // allocate C8-aligned staging buffers (when needed)
  int InitWeightBias();   // pack fp32 weights/bias into fp16 NC8HW8 layout
  int Execute(int task_id);  // per-thread compute body

 private:
  void FreeTmpBuffer();
  SlidingWindowParam *sliding_ = nullptr;   // owned; precomputed window geometry
  float16_t *packed_weight_ = nullptr;      // owned; NC8HW8 fp16 weights
  float16_t *packed_input_ = nullptr;       // owned only when need_align_
  float16_t *packed_output_ = nullptr;      // owned only when need_align_
  bool need_align_ = false;  // true when input_channel_ % C4NUM != 0
};
}  // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DEPTHWISE_SW_FP16_H_
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
浏览文件 @
5802b1f8
...
@@ -17,7 +17,6 @@
...
@@ -17,7 +17,6 @@
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
#include "src/runtime/kernel/arm/fp16/matrix_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/common_func.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/winograd_transform_fp16.h"
#include "nnacl/fp16/winograd_transform_fp16.h"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录