Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
ba588e20
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ba588e20
编写于
8月 21, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 21, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4692 [MS][LITE] optimize arm cpu op: conv depthwise
Merge pull request !4692 from yangruoqi713/lite
上级
82c888f0
2bf61d2d
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
254 addition
and
270 deletion
+254
-270
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
...src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+61
-50
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
.../src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+1
-0
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
...c/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+45
-42
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
...rc/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+1
-0
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
...lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+21
-29
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
.../src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
+40
-36
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
...e/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
+1
-7
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
...te/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
+23
-27
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
...src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+24
-33
mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
...c/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
+26
-36
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
+9
-9
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
+2
-1
未找到文件。
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
浏览文件 @
ba588e20
...
...
@@ -29,66 +29,67 @@ using mindspore::lite::RET_OK;
using
mindspore
::
schema
::
PrimitiveType_DepthwiseConv2D
;
namespace
mindspore
::
kernel
{
ConvolutionDepthwiseFp16CPUKernel
::~
ConvolutionDepthwiseFp16CPUKernel
()
{
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseFp16CPUKernel
::
FreeTmpBuffer
()
{
ConvolutionDepthwiseFp16CPUKernel
::~
ConvolutionDepthwiseFp16CPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseFp16CPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
}
}
}
int
ConvolutionDepthwiseFp16CPUKernel
::
InitBuffer
()
{
// malloc pack input buffer
int
C8
=
UP_DIV
(
conv_param_
->
input_channel_
,
C8NUM
)
;
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C8NUM
*
C8
;
packed_input_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
float16_t
)))
;
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
float16_t
));
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
C8
=
UP_DIV
(
conv_param_
->
input_channel_
,
C8NUM
)
;
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C8NUM
*
C8
;
packed_input_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
float16_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
// malloc pack output buffer
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C8NUM
*
C8
;
packed_output_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_output_size
*
sizeof
(
float16_t
)));
if
(
packed_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C8NUM
*
C8
;
packed_output_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_output_size
*
sizeof
(
float16_t
)))
;
if
(
packed_output_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
}
return
RET_OK
;
}
int
ConvolutionDepthwiseFp16CPUKernel
::
InitWeightBias
()
{
// init weight: o, h, w, i; o == group, i == 1
int
OC8
=
UP_DIV
(
conv_param_
->
output_channel_
,
C8NUM
);
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
int
OC8
=
UP_DIV
(
weight_tensor
->
Batch
(),
C8NUM
);
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
pack_weight_size
=
C8NUM
*
OC8
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
int
pack_weight_size
=
C8NUM
*
OC8
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
()
;
packed_weight_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
float16_t
));
PackNCHWFp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
,
conv_param_
->
output_channel_
);
PackNCHWFp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
// init bias
bias_data_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
C8NUM
*
OC8
*
sizeof
(
float16_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -97,8 +98,9 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
memset
(
bias_data_
,
0
,
C8NUM
*
OC8
*
sizeof
(
float16_t
));
auto
bias_fp16
=
reinterpret_cast
<
float16_t
*>
(
bias_data_
);
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
for
(
int
i
=
0
;
i
<
conv_param_
->
output_channel_
;
i
++
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
i
=
0
;
i
<
bias_tensor
->
ElementsNum
();
i
++
)
{
bias_fp16
[
i
]
=
(
float16_t
)
ori_bias
[
i
];
}
}
...
...
@@ -108,6 +110,18 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
}
int
ConvolutionDepthwiseFp16CPUKernel
::
Init
()
{
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param failed."
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitWeightBias failed."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
...
...
@@ -116,21 +130,12 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
int
ConvolutionDepthwiseFp16CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
// conv base init
auto
ret
=
ConvolutionBaseCPUKernel
::
Init
();
if
(
ret
!=
RET_OK
)
{
return
ret
;
}
// init sliding_ window param
sliding_
=
new
SlidingWindowParam
;
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C8NUM
);
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitWeightBias failed."
;
return
RET_ERROR
;
}
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp16 InitBuffer failed."
;
...
...
@@ -171,19 +176,25 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
MS_LOG
(
ERROR
)
<<
"Get Execute tensor failed."
;
return
ret
;
}
// pack input: to nhwc8
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
if
(
need_align_
)
{
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
}
else
{
packed_input_
=
execute_input_
;
}
if
(
!
need_align_
)
{
packed_output_
=
execute_output_
;
}
ret
=
LiteBackendParallelLaunch
(
ConvDwFp16Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvDwFp16Run error: error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
if
(
need_align_
)
{
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
}
ConvolutionBaseFP16CPUKernel
::
IfCastOutput
();
ConvolutionBaseFP16CPUKernel
::
FreeTmpBuffer
();
return
RET_OK
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
浏览文件 @
ba588e20
...
...
@@ -56,6 +56,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
float16_t
*
packed_weight_
=
nullptr
;
float16_t
*
packed_input_
=
nullptr
;
float16_t
*
packed_output_
=
nullptr
;
bool
need_align_
=
false
;
};
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
浏览文件 @
ba588e20
...
...
@@ -28,25 +28,28 @@ using mindspore::lite::RET_OK;
using
mindspore
::
schema
::
PrimitiveType_DeDepthwiseConv2D
;
namespace
mindspore
::
kernel
{
DeconvolutionDepthwiseFp16CPUKernel
::~
DeconvolutionDepthwiseFp16CPUKernel
()
{
FreeTmpBuffer
();
}
void
DeconvolutionDepthwiseFp16CPUKernel
::
FreeTmpBuffer
()
{
DeconvolutionDepthwiseFp16CPUKernel
::~
DeconvolutionDepthwiseFp16CPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
FreeTmpBuffer
();
}
void
DeconvolutionDepthwiseFp16CPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
delete
packed_output_
;
packed_output_
=
nullptr
;
}
}
}
...
...
@@ -59,14 +62,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
conv_param_
->
output_h_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_H
);
conv_param_
->
output_w_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_W
);
conv_param_
->
output_channel_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_C
);
// init sliding_ window param
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C8NUM
);
return
RET_OK
;
}
int
DeconvolutionDepthwiseFp16CPUKernel
::
InitBuffer
()
{
// malloc pack input buffer
int
C8
=
UP_DIV
(
conv_param_
->
input_channel_
,
C8NUM
);
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C8NUM
*
C8
;
packed_input_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
float16_t
)));
...
...
@@ -74,7 +74,6 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
float16_t
));
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C8NUM
*
C8
;
packed_output_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_output_size
*
sizeof
(
float16_t
)));
...
...
@@ -88,21 +87,19 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
int
DeconvolutionDepthwiseFp16CPUKernel
::
InitWeightBias
()
{
// init weight: o, h, w, i; o == group, i == 1
int
OC8
=
UP_DIV
(
conv_param_
->
output_channel_
,
C8NUM
);
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
int
OC8
=
UP_DIV
(
weight_tensor
->
Batch
(),
C8NUM
);
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
pack_weight_size
=
C8NUM
*
OC8
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
int
pack_weight_size
=
C8NUM
*
OC8
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
()
;
packed_weight_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
float16_t
));
PackNCHWFp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
,
conv_param_
->
output_channel_
);
PackNCHWFp32ToNC8HW8Fp16
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
// init bias
bias_data_
=
reinterpret_cast
<
float16_t
*>
(
malloc
(
C8NUM
*
OC8
*
sizeof
(
float16_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -110,8 +107,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
}
memset
(
bias_data_
,
0
,
C8NUM
*
OC8
*
sizeof
(
float16_t
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
for
(
int
i
=
0
;
i
<
conv_param_
->
output_channel_
;
i
++
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
i
=
0
;
i
<
bias_tensor
->
ElementsNum
();
i
++
)
{
reinterpret_cast
<
float
*>
(
bias_data_
)[
i
]
=
(
float16_t
)
ori_bias
[
i
];
}
}
...
...
@@ -121,6 +119,17 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
}
int
DeconvolutionDepthwiseFp16CPUKernel
::
Init
()
{
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new SlidingWindowParam fail!"
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp16 InitWeightBias failed."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
...
...
@@ -129,25 +138,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() {
int
DeconvolutionDepthwiseFp16CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new SlidingWindowParam fail!"
;
return
RET_ERROR
;
}
InitSlideParam
();
// conv base init
auto
ret
=
ConvolutionBaseCPUKernel
::
Init
();
if
(
ret
!=
RET_OK
)
{
return
ret
;
}
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp16 InitWeightBias failed."
;
return
RET_ERROR
;
}
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp16 InitBuffer failed."
;
...
...
@@ -188,18 +183,26 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
MS_LOG
(
ERROR
)
<<
"Get Execute tensor failed."
;
return
ret
;
}
// pack input: to nhwc8
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
if
(
need_align_
)
{
PackNHWCToNHWC8Fp16
(
execute_input_
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
}
else
{
packed_input_
=
execute_input_
;
}
if
(
!
need_align_
)
{
packed_output_
=
execute_output_
;
}
ret
=
LiteBackendParallelLaunch
(
DeconvDwFp16Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"DeconvDwFp16Run error: error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
if
(
need_align_
)
{
PackNHWC8ToNHWCFp16
(
packed_output_
,
execute_output_
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
}
ConvolutionBaseFP16CPUKernel
::
IfCastOutput
();
ConvolutionBaseFP16CPUKernel
::
FreeTmpBuffer
();
return
RET_OK
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
浏览文件 @
ba588e20
...
...
@@ -57,6 +57,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
float16_t
*
packed_weight_
=
nullptr
;
float16_t
*
packed_input_
=
nullptr
;
float16_t
*
packed_output_
=
nullptr
;
bool
need_align_
=
false
;
};
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
浏览文件 @
ba588e20
...
...
@@ -29,18 +29,19 @@ using mindspore::lite::RET_OK;
using
mindspore
::
schema
::
PrimitiveType_DepthwiseConv2D
;
namespace
mindspore
::
kernel
{
ConvolutionDepthwiseCPUKernel
::~
ConvolutionDepthwiseCPUKernel
()
{
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseCPUKernel
::
FreeTmpBuffer
()
{
ConvolutionDepthwiseCPUKernel
::~
ConvolutionDepthwiseCPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
FreeTmpBuffer
();
}
void
ConvolutionDepthwiseCPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
...
...
@@ -57,19 +58,17 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
()
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
()
;
packed_weight_
=
reinterpret_cast
<
float
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
float
));
PackNCHWToNC4HW4Fp32
(
origin_weight
,
packed_weight_
,
1
,
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
,
conv_param_
->
output_channel_
);
PackNCHWToNC4HW4Fp32
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
// init bias
bias_data_
=
reinterpret_cast
<
float
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
float
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -78,16 +77,14 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
float
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
conv_param_
->
output_channel_
*
sizeof
(
float
));
memcpy
(
bias_data_
,
ori_bias
,
in_tensors_
.
at
(
kBiasIndex
)
->
ElementsNum
()
*
sizeof
(
float
));
}
// init threadNum;
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
int
ConvolutionDepthwiseCPUKernel
::
InitBuffer
()
{
// malloc pack input and output buffer
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
IC4
=
UP_DIV
(
conv_param_
->
input_channel_
,
C4NUM
);
...
...
@@ -97,7 +94,6 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
float
));
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C4NUM
*
OC4
;
...
...
@@ -111,32 +107,29 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
}
int
ConvolutionDepthwiseCPUKernel
::
Init
()
{
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
ConvolutionDepthwiseCPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
// conv base init
ConvolutionBaseCPUKernel
::
Init
();
// init sliding window param
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param failed."
;
return
RET_ERROR
;
}
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C4NUM
);
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp32 InitWeightBias failed."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
ConvolutionDepthwiseCPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
ConvolutionBaseCPUKernel
::
Init
();
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C4NUM
);
ret
=
InitBuffer
();
auto
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Convolution depthwise fp32 InitBuffer failed."
;
return
RET_ERROR
;
...
...
@@ -173,7 +166,6 @@ int ConvolutionDepthwiseCPUKernel::Run() {
auto
input_tensor
=
in_tensors_
.
at
(
kInputIndex
);
auto
input_addr
=
reinterpret_cast
<
float
*>
(
input_tensor
->
Data
());
// pack input: to nhwc4
if
(
need_align_
)
{
PackNHWCToNHWC4Fp32
(
input_addr
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.cc
浏览文件 @
ba588e20
...
...
@@ -27,12 +27,41 @@ using mindspore::lite::RET_OK;
using
mindspore
::
schema
::
PrimitiveType_DepthwiseConv2D
;
namespace
mindspore
::
kernel
{
ConvolutionDepthwise3x3CPUKernel
::~
ConvolutionDepthwise3x3CPUKernel
()
{
FreeTmpBufer
();
if
(
block_buffer_
!=
nullptr
)
{
free
(
block_buffer_
);
block_buffer_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
}
}
void
ConvolutionDepthwise3x3CPUKernel
::
FreeTmpBufer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
free
(
packed_input_
);
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
free
(
packed_output_
);
packed_output_
=
nullptr
;
}
}
if
(
trans_buffer_
!=
nullptr
)
{
free
(
trans_buffer_
);
trans_buffer_
=
nullptr
;
}
}
int
ConvolutionDepthwise3x3CPUKernel
::
InitWeightBias
()
{
// init weight: o, h, w, i; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
// o h w 1 -> o/4 h w 1 4
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
()
,
C4NUM
);
int
weight_c4_size
=
OC4
*
C4NUM
*
9
;
auto
tmp_weight
=
reinterpret_cast
<
float
*>
(
malloc
(
weight_c4_size
*
sizeof
(
float
)));
if
(
tmp_weight
==
nullptr
)
{
...
...
@@ -40,8 +69,8 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
return
RET_ERROR
;
}
memset
(
tmp_weight
,
0
,
weight_c4_size
*
sizeof
(
float
));
PackNCHWToNC4HW4Fp32
(
origin_weight
,
tmp_weight
,
1
,
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
,
conv_param_
->
output_channel_
);
PackNCHWToNC4HW4Fp32
(
origin_weight
,
tmp_weight
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
()
,
weight_tensor
->
Batch
()
);
// weight transform
int
packed_weight_size
=
OC4
*
C4NUM
*
16
;
...
...
@@ -62,8 +91,9 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
float
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
conv_param_
->
output_channel_
*
sizeof
(
float
));
memcpy
(
bias_data_
,
ori_bias
,
in_tensors_
.
at
(
kBiasIndex
)
->
ElementsNum
()
*
sizeof
(
float
));
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
...
...
@@ -106,48 +136,22 @@ int ConvolutionDepthwise3x3CPUKernel::Init() {
MS_LOG
(
ERROR
)
<<
"malloc block buffer failed."
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise3x3 fp32 initWeightBias error!ret: "
<<
ret
;
return
ret
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
void
ConvolutionDepthwise3x3CPUKernel
::
FreeTmpBufer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
free
(
packed_input_
);
packed_input_
=
nullptr
;
}
if
(
packed_output_
!=
nullptr
)
{
free
(
packed_output_
);
packed_output_
=
nullptr
;
}
}
if
(
trans_buffer_
!=
nullptr
)
{
free
(
trans_buffer_
);
trans_buffer_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
}
}
int
ConvolutionDepthwise3x3CPUKernel
::
ReSize
()
{
FreeTmpBufer
();
// conv base init
ConvolutionBaseCPUKernel
::
Init
();
auto
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise3x3 fp32 initWeightBias error!ret: "
<<
ret
;
return
ret
;
}
// init threadNum;
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
));
ret
=
InitBuffer
();
auto
ret
=
InitBuffer
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise3x3 fp32 initBuffer error!ret: "
<<
ret
;
return
ret
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3.h
浏览文件 @
ba588e20
...
...
@@ -30,13 +30,7 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel {
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
~
ConvolutionDepthwise3x3CPUKernel
()
override
{
FreeTmpBufer
();
if
(
block_buffer_
!=
nullptr
)
{
free
(
block_buffer_
);
block_buffer_
=
nullptr
;
}
};
~
ConvolutionDepthwise3x3CPUKernel
()
override
;
int
Init
()
override
;
int
ReSize
()
override
;
...
...
mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
浏览文件 @
ba588e20
...
...
@@ -27,18 +27,19 @@ using mindspore::lite::RET_OK;
using
mindspore
::
schema
::
PrimitiveType_DeDepthwiseConv2D
;
namespace
mindspore
::
kernel
{
DeconvolutionDepthwiseCPUKernel
::~
DeconvolutionDepthwiseCPUKernel
()
{
FreeTmpBuffer
();
}
void
DeconvolutionDepthwiseCPUKernel
::
FreeTmpBuffer
()
{
DeconvolutionDepthwiseCPUKernel
::~
DeconvolutionDepthwiseCPUKernel
()
{
if
(
sliding_
!=
nullptr
)
{
delete
sliding_
;
sliding_
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
FreeTmpBuffer
();
}
void
DeconvolutionDepthwiseCPUKernel
::
FreeTmpBuffer
()
{
if
(
need_align_
)
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
...
...
@@ -60,9 +61,6 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
conv_param_
->
output_h_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_H
);
conv_param_
->
output_w_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_W
);
conv_param_
->
output_channel_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_C
);
// init sliding window param
sliding_
=
new
SlidingWindowParam
;
InitSlidingParamConvDw
(
sliding_
,
conv_param_
,
C4NUM
);
return
RET_OK
;
}
...
...
@@ -71,19 +69,17 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
()
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
()
;
packed_weight_
=
reinterpret_cast
<
float
*>
(
malloc
(
pack_weight_size
*
sizeof
(
float
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
float
));
PackNCHWToNC4HW4Fp32
(
origin_weight
,
packed_weight_
,
1
,
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
,
conv_param_
->
output_channel_
);
PackNCHWToNC4HW4Fp32
(
origin_weight
,
packed_weight_
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
// init bias
bias_data_
=
reinterpret_cast
<
float
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
float
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -92,16 +88,14 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
float
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
float
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
conv_param_
->
output_channel_
*
sizeof
(
float
));
memcpy
(
bias_data_
,
ori_bias
,
in_tensors_
.
at
(
kBiasIndex
)
->
ElementsNum
()
*
sizeof
(
float
));
}
// init threadNum;
conv_param_
->
thread_num_
=
MSMIN
(
conv_param_
->
thread_num_
,
OC4
);
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
int
DeconvolutionDepthwiseCPUKernel
::
InitBuffer
()
{
// malloc pack input and output buffer
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
IC4
=
UP_DIV
(
conv_param_
->
input_channel_
,
C4NUM
);
...
...
@@ -111,7 +105,6 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
float
));
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C4NUM
*
OC4
;
...
...
@@ -126,6 +119,17 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
}
int
DeconvolutionDepthwiseCPUKernel
::
Init
()
{
sliding_
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param failed."
;
return
RET_ERROR
;
}
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp32 InitWeightBias failed.ret: "
<<
ret
;
return
ret
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
...
...
@@ -135,16 +139,9 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
int
DeconvolutionDepthwiseCPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
InitSlideParam
();
// conv base init
ConvolutionBaseCPUKernel
::
Init
();
auto
ret
=
InitWeightBias
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp32 InitWeightBias failed.ret: "
<<
ret
;
return
ret
;
}
ret
=
InitBuffer
();
auto
ret
=
InitBuffer
();
if
(
ret
!=
0
)
{
MS_LOG
(
ERROR
)
<<
"Deconvolution depthwise fp32 InitBuffer failed.ret: "
<<
ret
;
return
ret
;
...
...
@@ -181,7 +178,6 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
auto
input_tensor
=
in_tensors_
.
at
(
kInputIndex
);
auto
input_addr
=
reinterpret_cast
<
float
*>
(
input_tensor
->
Data
());
// pack input: to nhwc4
if
(
need_align_
)
{
PackNHWCToNHWC4Fp32
(
input_addr
,
packed_input_
,
conv_param_
->
input_batch_
,
conv_param_
->
input_h_
*
conv_param_
->
input_w_
,
conv_param_
->
input_channel_
);
...
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
浏览文件 @
ba588e20
...
...
@@ -29,15 +29,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace
mindspore
::
kernel
{
void
ConvolutionDepthwiseInt8CPUKernel
::
FreeTmpBuffer
()
{
if
(
sliding
!=
nullptr
)
{
delete
sliding
;
sliding
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
}
if
(
packed_input_
!=
nullptr
)
{
free
(
packed_input_
);
packed_input_
=
nullptr
;
...
...
@@ -51,6 +42,14 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
}
// Tears down the kernel: drops the sliding-window parameter object and the
// packed weight buffer, then releases the temporary buffers and the
// quantization parameters through the dedicated helpers.
ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
  // `delete` and `free` are no-ops on a null pointer, so explicit null guards
  // are not needed; the members are reset to nullptr either way.
  delete sliding;
  sliding = nullptr;

  free(packed_weight_);
  packed_weight_ = nullptr;

  FreeTmpBuffer();
  FreeQuantParam();
}
...
...
@@ -58,18 +57,18 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
int
ConvolutionDepthwiseInt8CPUKernel
::
InitWeightBias
()
{
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
in_tensors_
[
kWeightIndex
]
->
Data
());
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
(),
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
packed_weight_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
int16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
int16_t
));
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
conv_param_
);
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
(),
&
(
conv_param_
->
conv_quant_arg_
)
);
// init bias, add output zp
bias_data_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
int32_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -77,18 +76,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
}
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
int32_t
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
conv_param_
->
output_channel_
*
sizeof
(
int32_t
));
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
bias_tensor
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
bias_tensor
->
ElementsNum
()
*
sizeof
(
int32_t
));
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
int
ConvolutionDepthwiseInt8CPUKernel
::
InitBuffer
()
{
// malloc packed input buffer
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
input_channel_
,
4
);
packed_input_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
int16_t
)));
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
int16_t
));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
...
...
@@ -108,6 +108,11 @@ int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
}
int
ConvolutionDepthwiseInt8CPUKernel
::
Init
()
{
sliding
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
...
...
@@ -116,32 +121,19 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
int
ConvolutionDepthwiseInt8CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
// conv base init
ConvolutionBaseCPUKernel
::
Init
();
// init sliding window param
sliding
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param."
;
return
RET_ERROR
;
}
InitSlidingParamConvDw
(
sliding
,
conv_param_
,
C4NUM
);
// init quant param
auto
ret
=
ConvolutionBaseCPUKernel
::
SetQuantParam
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Set quant param failed."
;
return
ret
;
}
// init weight and bias
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise int8 InitWeightBias error!"
;
return
ret
;
}
ret
=
InitBuffer
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise int8 ReSize error!"
;
...
...
@@ -177,7 +169,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
return
RET_ERROR
;
}
// pack input, assume input format: NHWC -> NHWC4
auto
input_tensor
=
in_tensors_
.
at
(
kInputIndex
);
auto
input_addr
=
reinterpret_cast
<
int8_t
*>
(
input_tensor
->
Data
());
PackDepthwiseInt8Input
(
input_addr
,
packed_input_
,
conv_param_
);
...
...
mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
浏览文件 @
ba588e20
...
...
@@ -29,11 +29,6 @@ using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
namespace
mindspore
::
kernel
{
DeconvolutionDepthwiseInt8CPUKernel
::~
DeconvolutionDepthwiseInt8CPUKernel
()
{
FreeTmpBuffer
();
FreeQuantParam
();
}
void
DeconvolutionDepthwiseInt8CPUKernel
::
FreeTmpBuffer
()
{
if
(
sliding
!=
nullptr
)
{
delete
sliding
;
sliding
=
nullptr
;
...
...
@@ -42,6 +37,11 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
delete
packed_weight_
;
packed_weight_
=
nullptr
;
}
FreeTmpBuffer
();
FreeQuantParam
();
}
void
DeconvolutionDepthwiseInt8CPUKernel
::
FreeTmpBuffer
()
{
if
(
packed_input_
!=
nullptr
)
{
delete
packed_input_
;
packed_input_
=
nullptr
;
...
...
@@ -61,18 +61,18 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
int
DeconvolutionDepthwiseInt8CPUKernel
::
InitWeightBias
()
{
// init weight: int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
in_tensors_
[
kWeightIndex
]
->
Data
());
int
OC4
=
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
conv_param_
->
kernel_h_
*
conv_param_
->
kernel_w_
;
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
(),
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
packed_weight_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
int16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
packed_weight_
,
0
,
pack_weight_size
*
sizeof
(
int16_t
));
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
conv_param_
);
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
(),
&
(
conv_param_
->
conv_quant_arg_
)
);
// init bias, add output zp
bias_data_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
int32_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
...
...
@@ -80,9 +80,11 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
}
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
int32_t
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
conv_param_
->
output_channel_
*
sizeof
(
int32_t
));
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
bias_tensor
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
bias_tensor
->
ElementsNum
()
*
sizeof
(
int32_t
));
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
...
...
@@ -96,7 +98,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
conv_param_
->
output_w_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_W
);
conv_param_
->
output_channel_
=
in_tensors_
.
front
()
->
shape
().
at
(
kNHWC_C
);
// init sliding window param
InitSlidingParamConvDw
(
sliding
,
conv_param_
,
C4NUM
);
sliding
->
in_h_step_
=
conv_param_
->
input_w_
*
C4NUM
;
...
...
@@ -108,11 +109,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
}
int
DeconvolutionDepthwiseInt8CPUKernel
::
InitBuffer
()
{
// malloc packed input buffer
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
input_channel_
,
4
);
packed_input_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_input_size
*
sizeof
(
int16_t
)));
memset
(
packed_input_
,
0
,
pack_input_size
*
sizeof
(
int16_t
));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
...
...
@@ -130,7 +129,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
memset
(
packed_output_
,
0
,
pack_output_size
*
sizeof
(
int8_t
));
}
// malloc tmp buffer for int32 output
output_buffer_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C4NUM
*
sizeof
(
int32_t
)));
if
(
output_buffer_
==
nullptr
)
{
...
...
@@ -145,41 +143,33 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
}
int
DeconvolutionDepthwiseInt8CPUKernel
::
Init
()
{
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
DeconvolutionDepthwiseInt8CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
sliding
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new SlidingWindowParam fail!"
;
return
RET_ERROR
;
}
InitSlideParam
();
// conv base init
ConvolutionBaseCPUKernel
::
Init
();
// init quant param
auto
ret
=
ConvolutionBaseCPUKernel
::
SetQuantParam
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Set quant param failed."
;
return
ret
;
}
// init weight and bias
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Deconv Depthwise int8 InitWeightBias error!"
;
return
ret
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
DeconvolutionDepthwiseInt8CPUKernel
::
ReSize
()
{
FreeTmpBuffer
();
InitSlideParam
();
ConvolutionBaseCPUKernel
::
Init
();
ret
=
InitBuffer
();
auto
ret
=
InitBuffer
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Deconv Depthwise int8 InitBuffer error!"
;
return
ret
;
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c
浏览文件 @
ba588e20
...
...
@@ -1035,18 +1035,18 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
}
}
void
PackDepthwiseInt8Weight
(
const
int8_t
*
origin_weight
,
int16_t
*
packed_weight_
,
const
ConvParameter
*
conv_param
)
{
int
weight_zp
=
conv_param
->
conv_quant_arg_
.
filter_quant_args_
[
0
].
zp_
;
int
unit
=
conv_param
->
kernel_h_
*
conv_param
->
kernel_w
_
;
for
(
int
c
=
0
;
c
<
c
onv_param
->
output_channel_
;
c
++
)
{
if
(
conv_param
->
conv_quant_arg_
.
per_channel_
&
FILTER_PER_CHANNEL
)
{
weight_zp
=
conv_param
->
conv_quant_arg_
.
filter_quant_args_
[
c
].
zp_
;
void
PackDepthwiseInt8Weight
(
const
int8_t
*
origin_weight
,
int16_t
*
packed_weight_
,
int
plane
,
int
channel
,
ConvQuantArg
*
quant_qrg
)
{
int
weight_zp
=
quant_qrg
->
filter_quant_args_
[
0
].
zp
_
;
for
(
int
c
=
0
;
c
<
c
hannel
;
c
++
)
{
if
(
quant_qrg
->
per_channel_
&
FILTER_PER_CHANNEL
)
{
weight_zp
=
quant_qrg
->
filter_quant_args_
[
c
].
zp_
;
}
int
c4_block_num
=
c
/
C4NUM
;
int
c4_block_rem
=
c
%
C4NUM
;
const
int8_t
*
src_c
=
origin_weight
+
c
*
unit
;
int16_t
*
dst_c
=
packed_weight_
+
c4_block_num
*
unit
*
C4NUM
;
for
(
int
k
=
0
;
k
<
unit
;
k
++
)
{
const
int8_t
*
src_c
=
origin_weight
+
c
*
plane
;
int16_t
*
dst_c
=
packed_weight_
+
c4_block_num
*
plane
*
C4NUM
;
for
(
int
k
=
0
;
k
<
plane
;
k
++
)
{
const
int8_t
*
src_kernel
=
src_c
+
k
;
int16_t
*
dst_kernel
=
dst_c
+
C4NUM
*
k
+
c4_block_rem
;
*
dst_kernel
=
(
int16_t
)(
src_kernel
[
0
]
-
weight_zp
);
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.h
浏览文件 @
ba588e20
...
...
@@ -100,7 +100,8 @@ void PackNCHWToNHWCInt8(const void *src, void *dst, int batch, int plane, int ch
// Packs the int8 depthwise-convolution input `src` into the int16 buffer `dst`.
// NOTE(review): the call site describes this as "NHWC -> NHWC4" packing and
// sizes `dst` as batch * h * w * C4NUM * UP_DIV(input_channel, 4) int16
// elements; the definition is not visible here, so confirm the exact layout
// and any zero-point handling against pack.c.
void
PackDepthwiseInt8Input
(
const
int8_t
*
src
,
int16_t
*
dst
,
const
ConvParameter
*
conv_param
);
void
PackDepthwiseInt8Weight
(
const
int8_t
*
src
,
int16_t
*
dst
,
const
ConvParameter
*
conv_param
);
// Packs int8 depthwise weights into int16, subtracting the filter zero point
// from every element.
//   origin_weight  - source weights, read as `channel` consecutive runs of
//                    `plane` values.
//   packed_weight_ - destination; channel c, element k is stored at
//                    (c / C4NUM) * plane * C4NUM + k * C4NUM + (c % C4NUM).
//   plane          - number of weight elements per channel (callers pass
//                    weight Height() * Width()).
//   channel        - number of channels (callers pass weight Batch()).
//   quant_qrg      - quantization arguments; filter_quant_args_[0].zp_ is the
//                    zero point unless per_channel_ has FILTER_PER_CHANNEL set,
//                    in which case filter_quant_args_[c].zp_ is used.
// Slots belonging to the channel padding up to a multiple of C4NUM are not
// written here; callers zero the destination buffer beforehand.
void
PackDepthwiseInt8Weight
(
const
int8_t
*
origin_weight
,
int16_t
*
packed_weight_
,
int
plane
,
int
channel
,
ConvQuantArg
*
quant_qrg
);
#ifdef __cplusplus
}
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录