magicwindyyd / mindspore
Forked from MindSpore / mindspore (in sync with the upstream project)
Commit 96df6f87, authored on Aug 19, 2020 by fuzhiye
1. malloc buffer using memory pool
2. rewrite resize func
Parent: 0feb98ae
21 changed files with 707 additions and 591 deletions (+707 −591)
Changed files:

mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc  +12 −0
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h  +1 −0
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc  +50 −39
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h  +16 −16
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc  +7 −2
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc  +41 −32
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h  +11 −10
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc  +61 −38
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h  +17 −16
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc  +58 −52
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h  +11 −10
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc  +57 −41
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h  +12 −13
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc  +52 −44
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h  +7 −6
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc  +67 −47
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h  +13 −13
mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc  +75 −59
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc  +116 −128
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h  +17 −17
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c  +6 −8
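Both halves of the commit message show up as one repeated pattern in the diffs below: per-run scratch buffers move from plain malloc/free in ReSize() to the context's memory-pool allocator inside Run(), while the shape-dependent persistent buffers are re-created in the rewritten ReSize() behind a new CheckResizeValid() guard. A minimal sketch of the scratch-buffer lifecycle, assuming only the ctx_->allocator->Malloc/Free interface visible in the diffs (ScratchKernel, tmp_buf_, tmp_size_ and DoCompute() are hypothetical placeholders, not names from this commit):

// Sketch of the per-run scratch lifecycle this commit introduces.
int ScratchKernel::Run() {
  // take scratch from the memory pool instead of calling malloc()
  tmp_buf_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_size_));
  if (tmp_buf_ == nullptr) {
    MS_LOG(ERROR) << "malloc tmp_buf_ failed.";
    return RET_ERROR;
  }
  int ret = DoCompute();
  // hand the scratch back to the pool on every exit path
  ctx_->allocator->Free(tmp_buf_);
  tmp_buf_ = nullptr;
  return ret;
}

Because the pool recycles freed blocks across runs, repeated inference avoids the malloc/free churn of the old code while keeping peak memory bounded by what a single run actually needs.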
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc

@@ -93,6 +93,18 @@ int ConvolutionBaseCPUKernel::Init() {
   return RET_OK;
 }
 
+int ConvolutionBaseCPUKernel::CheckResizeValid() {
+  // ===============check in channel================= //
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto filter_in_channel = filter_tensor->Channel();
+  int resize_in_channel = in_tensors_.at(kInputIndex)->Channel();
+  if (filter_in_channel != resize_in_channel) {
+    MS_LOG(ERROR) << "Channel of resized input should be equal to in channel of filter.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) {
   auto data_type = input_tensor->data_type();
   auto input_format = input_tensor->GetFormat();
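Every rewritten ReSize() in the kernels below begins by calling this new guard, so a resize whose input channel count no longer matches the filter is rejected before any buffer is re-allocated. Usage sketch (the kernel name is illustrative; the call pattern is the one repeated throughout this commit):

int SomeConvCPUKernel::ReSize() {
  // reject resized inputs whose channel count does not match the filter
  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Resize is invalid.";
    return ret;
  }
  // ... free and re-malloc the shape-dependent persistent buffers ...
  return RET_OK;
}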
mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h

@@ -58,6 +58,7 @@ class ConvolutionBaseCPUKernel : public LiteKernel {
   int SetFilterTensorQuantParam();
   int SetOutputTensorQuantParam();
   int SetQuantMultiplier();
+  int CheckResizeValid();
   void FreeQuantParam();
 
  protected:
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc

@@ -50,11 +50,14 @@ void ProcessFilterFp16(float16_t *origin_weight, float16_t *dst_weight, ConvPara
 }
 
 int Convolution3x3FP16CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  int output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC8 = UP_DIV(input_channel, C8NUM);
   int oC8 = UP_DIV(output_channel, C8NUM);
-  // init weight
+  // ===========================init weight========================== //
   size_t transformed_size = iC8 * C8NUM * oC8 * C8NUM * 36 * sizeof(float16_t);
   transformed_filter_addr_ = reinterpret_cast<float16_t *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {

@@ -69,7 +72,7 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
   }
   ProcessFilterFp16(execute_weight_, transformed_filter_addr_, conv_param_);
-  // init bias
+  // =============================init bias========================= //
   size_t new_bias_size = oC8 * C8NUM * sizeof(float16_t);
   bias_data_ = malloc(new_bias_size);
   if (bias_data_ == nullptr) {

@@ -92,55 +95,32 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
 int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   const int tile_num = 16;
   const int k_plane = 36;
   int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
-  tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t);
-  block_unit_buffer_ = reinterpret_cast<float16_t *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
   memset(block_unit_buffer_, 0, block_unit_buffer_size);
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * tile_num * k_plane * oC8 * C8NUM * sizeof(float16_t);
-  tmp_dst_buffer_ = reinterpret_cast<float16_t *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
   memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
   /*=============================tmp_out_============================*/
   int new_out_plane = UP_DIV(conv_param_->output_h_, C4NUM) * UP_DIV(conv_param_->output_w_, C4NUM) * C4NUM * C4NUM;
   size_t tmp_out_size = oC8 * C8NUM * conv_param_->output_batch_ * new_out_plane * sizeof(float16_t);
-  tmp_out_ = reinterpret_cast<float16_t *>(malloc(tmp_out_size));
+  tmp_out_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_out_size));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
   memset(tmp_out_, 0, tmp_out_size);
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc8_input_size =
-    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }

@@ -160,12 +140,22 @@ int Convolution3x3FP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int Convolution3x3FP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+  FreeTmpBuffer();
   if (tile_buffer_ != nullptr) {
     free(tile_buffer_);
     tile_buffer_ = nullptr;

@@ -174,21 +164,35 @@ int Convolution3x3FP16CPUKernel::ReSize() {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return ret;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
+  const int tile_num = 16;
+  const int k_plane = 36;
+  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc8_input_size =
+    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc8_input_size);
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
+  tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
+    return RET_ERROR;
+  }
+  memset(tile_buffer_, 0, tile_buffer_size);
   return RET_OK;
 }

@@ -220,6 +224,11 @@ int Convolution3x3FP16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get execute tensor failed.";
     return ret;
   }
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;

@@ -229,6 +238,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 fp16 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }

@@ -248,6 +258,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
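For scale, a worked size computation for the tile buffer that the rewritten ReSize() above re-allocates, assuming UP_DIV(x, y) rounds up as (x + y - 1) / y and C8NUM == 8 (both consistent with their use throughout these diffs; the concrete shape below is only an example):

// tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t)
// e.g. thread_count_ = 2 and input_channel_ = 20:
//   iC8 = UP_DIV(20, 8) = 3
//   tile_buffer_size = 2 * 16 * 36 * 3 * 8 * 2 bytes = 55296 bytes (54 KiB)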
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h

@@ -30,23 +30,11 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~Convolution3x3FP16CPUKernel() override { FreeTmpBuffer(); }
-
-  int Init() override;
-  int ReSize() override;
-  int Run() override;
-  int RunImpl(int task_id);
-  int InitWeightBias();
-  int InitTmpBuffer();
-  void ConfigInputOutput();
-
- private:
-  void FreeTmpBuffer() {
+  ~Convolution3x3FP16CPUKernel() override {
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
       fp16_weight_ = nullptr;
     }
     if (transformed_filter_addr_ != nullptr) {
       free(transformed_filter_addr_);
       transformed_filter_addr_ = nullptr;

@@ -55,16 +43,28 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
       free(tile_buffer_);
       tile_buffer_ = nullptr;
     }
   }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int RunImpl(int task_id);
+  int InitWeightBias();
+  int InitTmpBuffer();
+  void ConfigInputOutput();
+
+ private:
+  void FreeTmpBuffer() {
     if (block_unit_buffer_ != nullptr) {
-      free(block_unit_buffer_);
+      ctx_->allocator->Free(block_unit_buffer_);
       block_unit_buffer_ = nullptr;
     }
     if (tmp_dst_buffer_ != nullptr) {
-      free(tmp_dst_buffer_);
+      ctx_->allocator->Free(tmp_dst_buffer_);
       tmp_dst_buffer_ = nullptr;
     }
     if (tmp_out_ != nullptr) {
-      free(tmp_out_);
+      ctx_->allocator->Free(tmp_out_);
       tmp_out_ = nullptr;
     }
   }
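This header establishes the ownership split that repeats in every kernel header below: buffers that live as long as the kernel (weights, transformed filters, the re-mallocable tile/input buffers) are now released once in the destructor with free(), while per-run scratch handed out by the pool is returned in FreeTmpBuffer() via ctx_->allocator->Free() after each Run(). In sketch form (member and class names illustrative):

~SomeKernel() override {
  // persistent, heap-owned: freed once, at kernel teardown
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
}
void FreeTmpBuffer() {
  // per-run scratch: returned to the memory pool after each Run()
  if (tmp_output_block_ != nullptr) {
    ctx_->allocator->Free(tmp_output_block_);
    tmp_output_block_ = nullptr;
  }
}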
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc

@@ -143,14 +143,19 @@ int ConvolutionFP16CPUKernel::Init() {
 }
 
 int ConvolutionFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
     return ret;
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc

@@ -59,16 +59,19 @@ int ConvolutionSWFP16CPUKernel::ProcessFilter() {
 }
 
 int ConvolutionSWFP16CPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
   int oc4 = UP_DIV(out_channel, C4NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
-  // init weight
+  // ========================init weight==================== //
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_weight_ failed.";

@@ -81,7 +84,7 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
     return ret;
   }
-  // init bias
+  // =======================init bias====================== //
   bias_data_ = malloc(oc4 * C4NUM * sizeof(float16_t));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias_data_ failed.";

@@ -101,29 +104,16 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
 }
 
 int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
   int out_channel = conv_param_->output_channel_;
-  int channel_block = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(out_channel, C4NUM);
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
-                            conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float16_t *>(malloc(conv_param_->output_batch_ * conv_param_->output_h_ *
-                                                           conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
+  tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
     return RET_ERROR;
   }
   return RET_OK;
 }

@@ -142,32 +132,44 @@ int ConvolutionSWFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionSWFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
     return ret;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
-  ConfigInputOutput();
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
   // init sliding window param
   slidingWindow_param_ = new SlidingWindowParam;

@@ -202,6 +204,11 @@ int ConvolutionSWFP16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;

@@ -212,6 +219,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(ConvolutionSWFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }

@@ -224,6 +232,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h

@@ -28,7 +28,16 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionSWFP16CPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionSWFP16CPUKernel() override {
+    if (fp16_weight_ != nullptr) {
+      free(fp16_weight_);
+      fp16_weight_ = nullptr;
+    }
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;

@@ -41,16 +50,8 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (fp16_weight_ != nullptr) {
-      free(fp16_weight_);
-      fp16_weight_ = nullptr;
-    }
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
     if (slidingWindow_param_ != nullptr) {
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -110,10 +110,15 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei
 }
 
 int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
-  int output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
   int oc_block, oc_block_num;
   oc_block = C8NUM;
-  oc_block_num = UP_DIV(output_channel, C8NUM);
+  oc_block_num = UP_DIV(out_channel, C8NUM);
   // init weight
   auto ret = MallocFilterMatrix(oc_block, oc_block_num);

@@ -139,7 +144,7 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
   auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < output_channel; ++i) {
+    for (int i = 0; i < out_channel; ++i) {
       fp16_bias_data[i] = (float16_t)ori_bias[i];
     }
   } else {

@@ -188,25 +193,14 @@ int ConvolutionWinogradFP16CPUKernel::MallocFilterMatrix(int oc_block, int oc_bl
 int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int cal_num = 16;
-  int channel_in = conv_param_->input_channel_;
   int channel_out = conv_param_->output_channel_;
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
-  int ic8 = UP_DIV(channel_in, C8NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
-  /*=============================trans_input_============================*/
-  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
-  trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
-  if (trans_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc trans_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(trans_input_, 0, tile_buffer_size);
   /*=============================gemm_out_============================*/
-  gemm_out_ = reinterpret_cast<float16_t *>(
-    malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
+  gemm_out_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;

@@ -215,36 +209,26 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
-  tmp_out_data_ = reinterpret_cast<float16_t *>(malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
-                                                       output_unit_ * output_unit_ * oc8 * C8NUM * sizeof(float16_t)));
+  tmp_out_data_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * out_w_block * out_h_block * output_unit_ * output_unit_ * oc8 * C8NUM *
+    sizeof(float16_t)));
   if (tmp_out_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
   /*=============================tmp_data_============================*/
-  tmp_data_ = reinterpret_cast<float16_t *>(malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
+  tmp_data_ = reinterpret_cast<float16_t *>(
+    ctx_->allocator->Malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
   if (tmp_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_data_ failed.";
     return RET_ERROR;
   }
   memset(tmp_data_, 0, C8NUM * input_unit_ * input_unit_ * sizeof(float16_t));
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;
   tmp_buffer_address_list_[2] = tmp_out_data_;
   tmp_buffer_address_list_[3] = tmp_data_;
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc8_input_size =
-    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc8_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }

@@ -270,17 +254,37 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  kernel_unit_ = conv_param_->kernel_h_;
+  input_unit_ = output_unit_ + kernel_unit_ - 1;
+  conv_param_->input_unit_ = input_unit_;
+  conv_param_->output_unit_ = output_unit_;
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int ConvolutionWinogradFP16CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (trans_input_ != nullptr) {
+    free(trans_input_);
+    trans_input_ = nullptr;
+  }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;

@@ -290,17 +294,28 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // malloc tmp buffer
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
+  int cal_num = 16;
+  int channel_in = conv_param_->input_channel_;
+  int ic8 = UP_DIV(channel_in, C8NUM);
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc8_input_size =
+    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc8_input_size);
+  /*=============================trans_input_============================*/
+  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
+  trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
+  if (trans_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc trans_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(trans_input_, 0, tile_buffer_size);
   ret = ConfigInputOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConfigInputOutput failed.";

@@ -339,6 +354,12 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     return ret;
   }
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   int in_batch = conv_param_->input_batch_;
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;

@@ -348,6 +369,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }

@@ -364,6 +386,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h

@@ -33,7 +33,20 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                const mindspore::lite::PrimitiveC *primitive, int out_unit)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(out_unit) {}
-  ~ConvolutionWinogradFP16CPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionWinogradFP16CPUKernel() override {
+    if (fp16_weight_ != nullptr) {
+      free(fp16_weight_);
+      fp16_weight_ = nullptr;
+    }
+    if (trans_input_ != nullptr) {
+      free(trans_input_);
+      trans_input_ = nullptr;
+    }
+    if (trans_weight_ != nullptr) {
+      delete trans_weight_;
+      trans_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;

@@ -46,30 +59,18 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (fp16_weight_ != nullptr) {
-      free(fp16_weight_);
-      fp16_weight_ = nullptr;
-    }
     if (tmp_data_ != nullptr) {
-      free(tmp_data_);
+      ctx_->allocator->Free(tmp_data_);
       tmp_data_ = nullptr;
     }
-    if (trans_input_ != nullptr) {
-      free(trans_input_);
-      trans_input_ = nullptr;
-    }
     if (gemm_out_ != nullptr) {
-      free(gemm_out_);
+      ctx_->allocator->Free(gemm_out_);
       gemm_out_ = nullptr;
     }
     if (tmp_out_data_ != nullptr) {
-      free(tmp_out_data_);
+      ctx_->allocator->Free(tmp_out_data_);
       tmp_out_data_ = nullptr;
     }
-    if (trans_weight_ != nullptr) {
-      delete trans_weight_;
-      trans_weight_ = nullptr;
-    }
   }
   int kernel_unit_;
   int input_unit_;
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc

@@ -35,10 +35,13 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {
 int ConvolutionCPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  int in_channel = filter_tensor->Channel();
+  int out_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = in_channel;
+  conv_param_->output_channel_ = out_channel;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int oc_block, oc_block_num;

@@ -52,7 +55,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;
   // =====================init weight==========================//
-  auto origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data());
+  auto origin_weight = reinterpret_cast<float *>(filter_tensor->Data());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed weight failed.";

@@ -67,7 +70,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "malloc bias failed.";
     return RET_ERROR;
   }
   memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
     memcpy(bias_data_, ori_bias, out_channel * sizeof(float));

@@ -78,39 +81,11 @@ int ConvolutionCPUKernel::InitWeightBias() {
 }
 
 int ConvolutionCPUKernel::InitTmpBuffer() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_batch = conv_param_->input_batch_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
   int out_channel = conv_param_->output_channel_;
-  int kernel_plane = kernel_h * kernel_w;
-  // malloc packed_inputs
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
-  int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
-  /*=============================packed_input============================*/
-  packed_input_ = reinterpret_cast<float *>(malloc(in_batch * packed_input_size * sizeof(float)));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed input failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float));
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float *>(malloc(TILE_NUM * out_channel * sizeof(float)));
+  tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(TILE_NUM * out_channel * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp output block failed.";
     return RET_ERROR;

@@ -134,34 +109,59 @@ int ConvolutionCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionCPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
-  // config input output
-  ConfigInputOutput();
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+  /*=============================packed_input============================*/
+  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
+  int output_tile_count = UP_DIV(output_count, TILE_NUM);
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
+  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
+  packed_input_ = reinterpret_cast<float *>(malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float)));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed input failed.";
+    return RET_ERROR;
+  }
+  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size * sizeof(float));
   return RET_OK;
 }

@@ -192,19 +192,25 @@ int ConvolutionCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  // ============Init buffer using memory pool allocator=============//
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   int error_code = LiteBackendParallelLaunch(ConvolutionImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h

@@ -30,7 +30,16 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
                const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionCPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionCPUKernel() override {
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;

@@ -42,18 +51,10 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
   }
   float *packed_input_ = nullptr;
   float *packed_weight_ = nullptr;
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc

@@ -49,8 +49,11 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
 }
 
 int Convolution3x3CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  auto output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
   int oc_block, oc_block_num;

@@ -91,56 +94,35 @@ int Convolution3x3CPUKernel::InitWeightBias() {
 }
 
 int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   const int k_plane = 16;
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
-  tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile buffer failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float);
-  block_unit_buffer_ = reinterpret_cast<float *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
   memset(block_unit_buffer_, 0, block_unit_buffer_size);
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
-  tmp_dst_buffer_ = reinterpret_cast<float *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
   memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   /*=============================nc4hw4_out_============================*/
   size_t nc4hw4_out_size =
     oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
-  nc4hw4_out_ = reinterpret_cast<float *>(malloc(nc4hw4_out_size));
+  nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
   if (nc4hw4_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc nc4hw4_out_ failed.";
     return RET_ERROR;
   }
   memset(nc4hw4_out_, 0, nc4hw4_out_size);
   tmp_buffer_address_list_[0] = tile_buffer_;
   tmp_buffer_address_list_[1] = block_unit_buffer_;
   tmp_buffer_address_list_[2] = tmp_dst_buffer_;

@@ -162,28 +144,57 @@ int Convolution3x3CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.ret: " << ret;
+    return RET_ERROR;
+  }
+  ConfigInputOutput();
   return ReSize();
 }
 
 int Convolution3x3CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
   FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (tile_buffer_ != nullptr) {
+    free(tile_buffer_);
+    tile_buffer_ = nullptr;
+  }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.ret: " << ret;
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.ret: " << ret;
-    return RET_ERROR;
-  }
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
-    return RET_ERROR;
-  }
-  ConfigInputOutput();
+  int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  /*=============================nhwc4_input_============================*/
+  size_t nhwc4_input_size =
+    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
+  tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile buffer failed.";
+    return RET_ERROR;
+  }
+  memset(tile_buffer_, 0, tile_buffer_size);
   return RET_OK;
 }

@@ -214,17 +225,21 @@ int Convolution3x3CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   int error_code = LiteBackendParallelLaunch(Convolution3x3Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }

@@ -241,6 +256,7 @@ int Convolution3x3CPUKernel::Run() {
     PackNC4HW4ToNHWCFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h

@@ -29,8 +29,15 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
                const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~Convolution3x3CPUKernel() override { FreeTmpBuffer(); }
+  ~Convolution3x3CPUKernel() override {
+    if (transformed_filter_addr_ != nullptr) {
+      free(transformed_filter_addr_);
+    }
+    if (tile_buffer_ != nullptr) {
+      free(tile_buffer_);
+      tile_buffer_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;
   int Run() override;

@@ -41,24 +48,16 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (tile_buffer_ != nullptr) {
-      free(tile_buffer_);
-      tile_buffer_ = nullptr;
-    }
     if (block_unit_buffer_ != nullptr) {
-      free(block_unit_buffer_);
+      ctx_->allocator->Free(block_unit_buffer_);
       block_unit_buffer_ = nullptr;
     }
     if (tmp_dst_buffer_ != nullptr) {
-      free(tmp_dst_buffer_);
+      ctx_->allocator->Free(tmp_dst_buffer_);
       tmp_dst_buffer_ = nullptr;
     }
-    if (nhwc4_input_ != nullptr) {
-      free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (nc4hw4_out_ != nullptr) {
-      free(nc4hw4_out_);
+      ctx_->allocator->Free(nc4hw4_out_);
       nc4hw4_out_ = nullptr;
     }
   }
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc

@@ -30,14 +30,17 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
 
 int ConvolutionSWCPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int out_channel = conv_param_->output_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int oc_block = C4NUM;
-  int oc_block_num = UP_DIV(out_channel, C4NUM);
+  int oc_block_num = UP_DIV(output_channel, C4NUM);
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;
   // ==================================init weight======================================//

@@ -48,13 +51,13 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  for (int oc = 0; oc < out_channel; ++oc) {
-    int src_oc_offset = oc * kernel_h * kernel_w * in_channel;
+  for (int oc = 0; oc < output_channel; ++oc) {
+    int src_oc_offset = oc * kernel_h * kernel_w * input_channel;
     int dst_oc_offset = oc * kernel_h * kernel_w * ic4 * C4NUM;
     for (int i = 0; i < kernel_h * kernel_w; ++i) {
-      const float *src = origin_weight + src_oc_offset + i * in_channel;
+      const float *src = origin_weight + src_oc_offset + i * input_channel;
       float *dst = packed_weight_ + dst_oc_offset + i * ic4 * C4NUM;
-      memcpy(dst, src, in_channel * sizeof(float));
+      memcpy(dst, src, input_channel * sizeof(float));
     }
   }

@@ -67,7 +70,7 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }

@@ -75,24 +78,13 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
 }
 
 int ConvolutionSWCPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
   int out_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(out_channel, C4NUM);
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_output_block_============================*/
-  tmp_output_block_ = reinterpret_cast<float *>(malloc(conv_param_->output_batch_ * conv_param_->output_h_ *
-                                                       conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
+  tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp output block failed.";
     return RET_ERROR;

@@ -110,39 +102,49 @@ int ConvolutionSWCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  // config input output
+  ConfigInputOutput();
   return ReSize();
 }
 
 int ConvolutionSWCPUKernel::ReSize() {
-  FreeTmpBuffer();
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
+  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
   // init sliding window param
   slidingWindow_param_ = new SlidingWindowParam;
   InitSlidingParamConv(slidingWindow_param_, conv_param_, C4NUM);
-  // config input output
-  ConfigInputOutput();
   return RET_OK;
 }

@@ -169,20 +171,25 @@ int ConvolutionSWCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  // init tmp input, output
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   int error_code = LiteBackendParallelLaunch(ConvolutionSWImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
   // output nhwc4
   auto out_tensor = out_tensors_.front();
   auto out_data = reinterpret_cast<float *>(out_tensor->Data());
   int oc4_res = conv_param_->output_channel_ % C4NUM;

@@ -190,6 +197,7 @@ int ConvolutionSWCPUKernel::Run() {
     PackNHWC4ToNHWCFp32(tmp_output_block_, out_data, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.h

@@ -32,7 +32,12 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionSWCPUKernel() override { FreeTmpBuffer(); }
+  ~ConvolutionSWCPUKernel() override {
+    if (packed_weight_ != nullptr) {
+      free(packed_weight_);
+      packed_weight_ = nullptr;
+    }
+  }
 
   int Init() override;
   int ReSize() override;

@@ -44,12 +49,8 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (packed_weight_ != nullptr) {
-      free(packed_weight_);
-      packed_weight_ = nullptr;
-    }
     if (tmp_output_block_ != nullptr) {
-      free(tmp_output_block_);
+      ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
     if (slidingWindow_param_ != nullptr) {
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
浏览文件 @
96df6f87
...
...
@@ -30,12 +30,12 @@ using mindspore::schema::PrimitiveType_Conv2D;
namespace
mindspore
::
kernel
{
void
WinogradFilterTransform
(
const
float
*
weight_data
,
Matrix
*
trans_weight
,
int
kernel_unit
,
int
input_unit
,
ConvParameter
*
conv_param
,
int
oc_block
)
{
//
original weight format : ohwi
//
=============original weight format : ohwi===============//
auto
channel_in
=
conv_param
->
input_channel_
;
auto
channel_out
=
conv_param
->
output_channel_
;
int
input_unit_square
=
input_unit
*
input_unit
;
//
generate matrix_G && matrix_GT
//
=============generate matrix_G && matrix_GT===============//
auto
matrix_g
=
TransformMatrixGenerator
(
input_unit
,
kernel_unit
);
auto
matrix_gt
=
TransformMatrixGenerator
(
kernel_unit
,
input_unit
);
ChooseMatrixG
(
matrix_g
,
matrix_gt
);
...
...
@@ -95,15 +95,20 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
}
int
ConvolutionWinogradCPUKernel
::
InitWeightBias
()
{
int
output_channel
=
conv_param_
->
output_channel_
;
int
oc4
=
UP_DIV
(
output_channel
,
C4NUM
);
auto
filter_tensor
=
in_tensors_
.
at
(
kWeightIndex
);
int
in_channel
=
filter_tensor
->
Channel
();
int
out_channel
=
filter_tensor
->
Batch
();
conv_param_
->
input_channel_
=
in_channel
;
conv_param_
->
output_channel_
=
out_channel
;
int
oc4
=
UP_DIV
(
out_channel
,
C4NUM
);
int
oc_block
,
oc_block_num
;
// #ifdef ENABLE_ARM32
// oc_block = C4NUM;
// oc_block_num = UP_DIV(output_channel, C4NUM);
// #else
oc_block
=
C8NUM
;
oc_block_num
=
UP_DIV
(
out
put
_channel
,
C8NUM
);
oc_block_num
=
UP_DIV
(
out_channel
,
C8NUM
);
// #endif
// init weight
...
...
@@ -112,8 +117,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
-  auto weight_tensor = in_tensors_.at(kWeightIndex);
-  auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
+  auto weight_data = reinterpret_cast<float *>(filter_tensor->Data());
   WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
   // init bias
...
@@ -122,7 +126,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, new_bias_size);
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias_addr = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias_addr, output_channel * sizeof(float));
+    memcpy(bias_data_, ori_bias_addr, out_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
...
@@ -167,25 +171,15 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_
 }
 
 int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
-  int channel_in = conv_param_->input_channel_;
   int channel_out = conv_param_->output_channel_;
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
-  int ic4 = UP_DIV(channel_in, C4NUM);
   int oc4 = UP_DIV(channel_out, C4NUM);
-  /*=============================trans_input_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
-  trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
-  if (trans_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc trans_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(trans_input_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================gemm_out_============================*/
   gemm_out_ = reinterpret_cast<float *>(
-    malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
+    ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
   if (gemm_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc gemm_out_ failed.";
     return RET_ERROR;
...
@@ -194,35 +188,26 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
-  tmp_out_data_ = reinterpret_cast<float *>(malloc(conv_param_->output_batch_ * out_w_block * out_h_block *
-                                                   output_unit_ * output_unit_ * oc4 * C4NUM * sizeof(float)));
+  tmp_out_data_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(
+    conv_param_->output_batch_ * out_w_block * out_h_block * output_unit_ * output_unit_ * oc4 * C4NUM * sizeof(float)));
   if (tmp_out_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
   /*=============================tmp_data_============================*/
-  tmp_data_ = reinterpret_cast<float *>(malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
+  tmp_data_ = reinterpret_cast<float *>(
+    ctx_->allocator->Malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
   if (tmp_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_data_ failed.";
     return RET_ERROR;
   }
   memset(tmp_data_, 0, C4NUM * input_unit_ * input_unit_ * sizeof(float));
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;
   tmp_buffer_address_list_[2] = tmp_out_data_;
   tmp_buffer_address_list_[3] = tmp_data_;
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
...
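For scale, the per-run buffers above grow with the thread count and tile count. Under hypothetical numbers (two threads, a tile count of 12, a 4x4 output unit with a 3x3 kernel so input_unit_ = 6, and 16 input channels) the transform-input buffer works out as below; the constants are illustrative, not taken from the diff:

#include <cstddef>
#include <cstdio>

int main() {
  // Illustrative values only: the real TILE_NUM and thread count vary by build/device.
  const std::size_t thread_count = 2, tile_num = 12;
  const std::size_t input_unit = 6;      // output_unit 4 + kernel_unit 3 - 1
  const std::size_t ic4 = 4, c4num = 4;  // 16 input channels = 4 blocks of 4
  std::size_t tile_buffer_size =
      thread_count * tile_num * input_unit * input_unit * ic4 * c4num * sizeof(float);
  printf("trans_input_ scratch: %zu bytes\n", tile_buffer_size);  // 55296
  return 0;
}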
@@ -253,37 +238,67 @@ int ConvolutionWinogradCPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  kernel_unit_ = conv_param_->kernel_h_;
+  input_unit_ = output_unit_ + kernel_unit_ - 1;
+  conv_param_->input_unit_ = input_unit_;
+  conv_param_->output_unit_ = output_unit_;
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
   return ReSize();
 }
 
 int ConvolutionWinogradCPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
-  FreeTmpBuffer();
+  if (nhwc4_input_ != nullptr) {
+    free(nhwc4_input_);
+    nhwc4_input_ = nullptr;
+  }
+  if (trans_input_ != nullptr) {
+    free(trans_input_);
+    trans_input_ = nullptr;
+  }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  kernel_unit_ = conv_param_->kernel_h_;
-  input_unit_ = output_unit_ + kernel_unit_ - 1;
-  conv_param_->input_unit_ = input_unit_;
-  conv_param_->output_unit_ = output_unit_;
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
+    return RET_ERROR;
+  }
-  // malloc tmp buffer
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+  /*=============================trans_input_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
+  trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
+  if (trans_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc trans_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(trans_input_, 0, tile_buffer_size);
   ret = ConfigInputOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConfigInputOutput failed.";
...
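After this rewrite the winograd kernel's lifecycle is layered: Init() fixes the transform units and packs weights once; ReSize() re-allocates the shape-dependent heap buffers (nhwc4_input_, trans_input_) whenever tensor shapes change; Run() borrows the remaining scratch from the pool and returns it before exiting. A compressed sketch of that call flow (method names mirror the kernel, bodies elided):

// Sketch of the allocation lifecycle this commit establishes.
struct WinogradLifecycleSketch {
  int Init() {    // once per kernel
    // derive kernel_unit_/input_unit_, transform and pack weights
    return ReSize();
  }
  int ReSize() {  // on every shape change
    // free + re-malloc nhwc4_input_ and trans_input_ for the new shape
    return 0;
  }
  int Run() {     // every inference
    // InitTmpBuffer(): pool Malloc for gemm_out_, tmp_out_data_, tmp_data_
    // ... compute ...
    // FreeTmpBuffer(): pool Free of the same buffers
    return 0;
  }
};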
@@ -319,17 +334,21 @@ int ConvolutionWinogradCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
+  // malloc tmp buffer
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
...
@@ -346,6 +365,7 @@ int ConvolutionWinogradCPUKernel::Run() {
     UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                          conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h  View file @ 96df6f87
...
@@ -30,10 +30,18 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   ConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                                const mindspore::lite::PrimitiveC *primitive, int output_unit)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(output_unit),
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
+        output_unit_(output_unit),
         trans_weight_(nullptr) {}
   ~ConvolutionWinogradCPUKernel() override {
-    FreeTmpBuffer();
+    if (trans_weight_ != nullptr) {
+      delete trans_weight_;
+      trans_weight_ = nullptr;
+    }
+    if (trans_input_ != nullptr) {
+      free(trans_input_);
+      trans_input_ = nullptr;
+    }
   };
   int Init() override;
   int ReSize() override;
...
@@ -47,25 +55,17 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
  private:
   void FreeTmpBuffer() {
     if (tmp_data_ != nullptr) {
-      free(tmp_data_);
+      ctx_->allocator->Free(tmp_data_);
       tmp_data_ = nullptr;
     }
-    if (trans_input_ != nullptr) {
-      free(trans_input_);
-      trans_input_ = nullptr;
-    }
     if (gemm_out_ != nullptr) {
-      free(gemm_out_);
+      ctx_->allocator->Free(gemm_out_);
       gemm_out_ = nullptr;
     }
     if (tmp_out_data_ != nullptr) {
-      free(tmp_out_data_);
+      ctx_->allocator->Free(tmp_out_data_);
      tmp_out_data_ = nullptr;
     }
-    if (trans_weight_ != nullptr) {
-      delete trans_weight_;
-      trans_weight_ = nullptr;
-    }
   }
   int kernel_unit_;
   int input_unit_;
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc  View file @ 96df6f87
...
@@ -44,6 +44,21 @@ void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParameter
 }
 
 void Convolution3x3Int8CPUKernel::FreeTmpBuffer() {
+  if (block_unit_buffer_ != nullptr) {
+    ctx_->allocator->Free(block_unit_buffer_);
+    block_unit_buffer_ = nullptr;
+  }
+  if (tmp_dst_buffer_ != nullptr) {
+    ctx_->allocator->Free(tmp_dst_buffer_);
+    tmp_dst_buffer_ = nullptr;
+  }
+  if (tmp_out_ != nullptr) {
+    ctx_->allocator->Free(tmp_out_);
+    tmp_out_ = nullptr;
+  }
+}
+
+Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() {
   if (transformed_filter_addr_ != nullptr) {
     free(transformed_filter_addr_);
     transformed_filter_addr_ = nullptr;
...
@@ -56,26 +71,15 @@ void Convolution3x3Int8CPUKernel::FreeTmpBuffer() {
     free(tile_buffer_);
     tile_buffer_ = nullptr;
   }
-  if (block_unit_buffer_ != nullptr) {
-    free(block_unit_buffer_);
-    block_unit_buffer_ = nullptr;
-  }
-  if (tmp_dst_buffer_ != nullptr) {
-    free(tmp_dst_buffer_);
-    tmp_dst_buffer_ = nullptr;
-  }
-  if (tmp_out_ != nullptr) {
-    free(tmp_out_);
-    tmp_out_ = nullptr;
-  }
   FreeQuantParam();
 }
 
-Convolution3x3Int8CPUKernel::~Convolution3x3Int8CPUKernel() { FreeTmpBuffer(); }
-
 int Convolution3x3Int8CPUKernel::InitWeightBias() {
-  auto input_channel = conv_param_->input_channel_;
-  auto output_channel = conv_param_->output_channel_;
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
   int iC8 = UP_DIV(input_channel, C8NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
   // init weight
...
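Reading the shapes off the weight tensor, as the new InitWeightBias does, makes the kernel self-sufficient: conv filters are stored OHWI (the comment in WinogradFilterTransform above says as much), so Batch() is the output-channel count and Channel() the input-channel count, and writing them back into conv_param_ keeps later code consistent even if the parameter was stale when shapes changed. A sketch of that mapping with a hypothetical shape holder mirroring the tensor accessors:

// Hypothetical mirror of the lite::tensor::Tensor accessors used above.
struct FilterShape {
  int batch, height, width, channel;  // OHWI layout for conv weights
};

struct ConvDims {
  int in_channel, out_channel, kernel_h, kernel_w;
};

ConvDims DimsFromFilter(const FilterShape &f) {
  // O -> output channels, H/W -> kernel size, I -> input channels
  return {f.channel, f.batch, f.height, f.width};
}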
@@ -107,59 +111,35 @@ int Convolution3x3Int8CPUKernel::InitWeightBias() {
 }
 
 int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
-  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int in_batch = conv_param_->input_batch_;
-  int input_w = conv_param_->input_w_;
-  int input_h = conv_param_->input_h_;
   int output_batch = conv_param_->output_batch_;
   int output_w = conv_param_->output_w_;
   int output_h = conv_param_->output_h_;
-  /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * TILE_NUM * 16 * ic8 * C8NUM * sizeof(int16_t);
-  tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size));
-  if (tile_buffer_ == nullptr) {
-    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
-    return RET_ERROR;
-  }
-  memset(tile_buffer_, 0, tile_buffer_size);
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * 4 * 4 * C8NUM * sizeof(int16_t);
-  block_unit_buffer_ = reinterpret_cast<int16_t *>(malloc(block_unit_buffer_size));
+  block_unit_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";
     return RET_ERROR;
   }
   memset(block_unit_buffer_, 0, block_unit_buffer_size);
   /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t);
-  tmp_dst_buffer_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_buffer_size));
+  tmp_dst_buffer_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
     return RET_ERROR;
   }
   memset(tmp_dst_buffer_, 0, tmp_dst_buffer_size);
   /*=============================tmp_out_============================*/
   size_t tmp_out_size = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t);
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(tmp_out_size));
+  tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_out_size));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
   memset(tmp_out_, 0, tmp_out_size);
-  /*=============================input_data_============================*/
-  size_t c8_input_size = in_batch * input_h * input_w * ic8 * C8NUM * sizeof(int16_t);
-  input_data_ = reinterpret_cast<int16_t *>(malloc(c8_input_size));
-  if (input_data_ == nullptr) {
-    MS_LOG(ERROR) << "malloc input_data_ failed.";
-    return RET_ERROR;
-  }
-  memset(input_data_, 0, c8_input_size);
   return RET_OK;
 }
...
@@ -172,35 +152,63 @@ int Convolution3x3Int8CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+  auto ret = SetQuantParam();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Set quant param failed.";
+    return ret;
+  }
+  ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init weight bias failed.";
+    return RET_ERROR;
+  }
+  // config input output
+  ConfigInputOutput();
   return ReSize();
 }
 
 int Convolution3x3Int8CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
+  }
-  FreeTmpBuffer();
+  if (input_data_ != nullptr) {
+    free(input_data_);
+    input_data_ = nullptr;
+  }
+  if (tile_buffer_ != nullptr) {
+    free(tile_buffer_);
+    tile_buffer_ = nullptr;
+  }
-  auto ret = ConvolutionBaseCPUKernel::Init();
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  ret = SetQuantParam();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Set quant param failed.";
-    return ret;
-  }
-  ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
+  /*=============================input_data_============================*/
+  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+  size_t c8_input_size =
+    conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t);
+  input_data_ = reinterpret_cast<int16_t *>(malloc(c8_input_size));
+  if (input_data_ == nullptr) {
+    MS_LOG(ERROR) << "malloc input_data_ failed.";
+    return RET_ERROR;
+  }
-  // init tmp input, output
-  ret = InitTmpBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
+  memset(input_data_, 0, c8_input_size);
+  /*=============================tile_buffer_============================*/
+  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t);
+  tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size));
+  if (tile_buffer_ == nullptr) {
+    MS_LOG(ERROR) << "malloc tile_buffer_ failed.";
+    return RET_ERROR;
+  }
-  // config input output
-  ConfigInputOutput();
+  memset(tile_buffer_, 0, tile_buffer_size);
   return RET_OK;
 }
...
@@ -227,12 +235,19 @@ int Convolution3x3Int8CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare failed.";
     return RET_ERROR;
   }
+  // malloc tmp buffer
+  ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    return RET_ERROR;
+  }
   auto input_addr = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->Data());
   PackInputToC8Int8(input_addr, input_data_, conv_param_);
   int error_code = LiteBackendParallelLaunch(Convolution3x3Int8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv3x3 int8 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
   // get real output
...
@@ -240,6 +255,7 @@ int Convolution3x3Int8CPUKernel::Run() {
   auto out_data = reinterpret_cast<int8_t *>(out_tensor->Data());
   PackNC4HW4ToNHWCInt8(tmp_out_, out_data, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc  View file @ 96df6f87
...
@@ -60,12 +60,15 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() {
 }
 
 int ConvolutionInt8CPUKernel::InitWeightBias() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int out_channel = conv_param_->output_channel_;
-  int oc4 = UP_DIV(out_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
+  int oc4 = UP_DIV(output_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * plane_c4 * C4NUM;
...
@@ -80,8 +83,8 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size);
-  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * out_channel));
-  for (int i = 0; i < out_channel; i++) weight_sum[i] = 0;
+  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel));
+  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
   PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);
   // init bias
...
@@ -93,42 +96,22 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(int32_t));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
   auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
   int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM;
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   } else {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   }
   free(weight_sum);
   return RET_OK;
 }
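The bias adjustment in both loops above is the standard zero-point folding for quantized convolution. Expanding one output channel's accumulator over its K = kernel_plane * ic4 * C4NUM padded taps: sum_k (w_k - zp_w)(x_k - zp_x) = sum_k w_k*x_k - zp_x * sum_k w_k - zp_w * sum_k x_k + K * zp_w * zp_x. The first term is the raw int8 dot product computed per tile, the third is the input-dependent input_sum_ term handled at runtime, and the two remaining terms depend only on the weights, so they fold into the bias exactly as the loop writes them:

#include <cstdint>

// Weight-only part of the zero-point expansion, matching the loop body above:
//   bias += zp_w * zp_x * K - weight_sum * zp_x
// where weight_sum is the per-channel sum of the packed (zero-padded) weights.
int32_t FoldZeroPointsIntoBias(int32_t bias, int32_t filter_zp, int32_t input_zp,
                               int32_t weight_sum, int32_t k_padded_taps) {
  return bias + filter_zp * input_zp * k_padded_taps - weight_sum * input_zp;
}

K is the padded tap count (c4_kernel_plane_size) because the packed buffers carry their zero-filled padding slots through the same arithmetic.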
 int ConvolutionInt8CPUKernel::InitTmpBuffer() {
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
-  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  /*=============================packed_input_============================*/
-  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
   /*=============================input_sum_============================*/
   size_t input_sum_size;
...
@@ -137,47 +120,45 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() {
   } else {
     input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
   }
-  input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size));
+  input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size));
   if (input_sum_ == nullptr) {
     MS_LOG(ERROR) << "malloc input_sum_ failed.";
     return RET_ERROR;
   }
   memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
   return RET_OK;
 }
 
 int ConvolutionInt8CPUKernel::InitTmpBuffer() {
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_dst_============================*/
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
-  tmp_dst_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_size));
+  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_ failed.";
     return RET_ERROR;
   }
   memset(tmp_dst_, 0, tmp_dst_size);
   /*=============================tmp_out_============================*/
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
+  tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
 int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
-  int kernel_h = conv_param_->kernel_h_;
-  int kernel_w = conv_param_->kernel_w_;
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int out_channel = conv_param_->output_channel_;
-  int oc4 = UP_DIV(out_channel, C4NUM);
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+  int kernel_h = filter_tensor->Height();
+  int kernel_w = filter_tensor->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  int ic4 = UP_DIV(input_channel, C4NUM);
+  int oc4 = UP_DIV(output_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
   auto filter_arg = conv_param_->conv_quant_arg_.filter_quant_args_;
...
@@ -191,8 +172,8 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size);
-  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * out_channel));
-  for (int i = 0; i < out_channel; i++) weight_sum[i] = 0;
+  auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel));
+  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
   PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum);
   // init bias
...
@@ -204,41 +185,22 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
   memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, out_channel * sizeof(int32_t));
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
   auto *bias_data = reinterpret_cast<int32_t *>(bias_data_);
   int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM;
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   } else {
-    for (int i = 0; i < out_channel; i++) {
+    for (int i = 0; i < output_channel; i++) {
       bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp;
     }
   }
   free(weight_sum);
   return RET_OK;
 }
 
 int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
-  int in_channel = conv_param_->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  /*=============================packed_input_============================*/
-  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc packed_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
   /*=============================input_sum_============================*/
   size_t input_sum_size;
...
@@ -253,31 +215,26 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
     return RET_ERROR;
   }
   memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
   return RET_OK;
 }
 
 int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
+  MS_ASSERT(ctx_->allocator != nullptr);
   /*=============================tmp_dst_============================*/
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
-  tmp_dst_ = reinterpret_cast<int32_t *>(malloc(tmp_dst_size));
+  tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_dst_ failed.";
     return RET_ERROR;
   }
   memset(tmp_dst_, 0, tmp_dst_size);
   /*=============================tmp_out_============================*/
-  tmp_out_ = reinterpret_cast<int8_t *>(malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
+  tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
   if (tmp_out_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_out_ failed.";
     return RET_ERROR;
   }
-  /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
   return RET_OK;
 }
...
@@ -296,62 +253,79 @@ int ConvolutionInt8CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  return ReSize();
-}
-
-int ConvolutionInt8CPUKernel::InitOpt() {
-  auto ret = InitWeightBiasOpt();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
-  }
-  // init tmp input, output
-  ret = InitTmpBufferOpt();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-int ConvolutionInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
-  auto ret = ConvolutionBaseCPUKernel::Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "ConvolutionBase init failed.";
-    return RET_ERROR;
-  }
   // config input output
   ConfigInputOutput();
   CheckSupportOptimize();
-  ret = SetQuantParam();
+  auto ret = SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
-  // init for opt
   if (support_optimize_) {
-    ret = InitOpt();
+    ret = InitWeightBiasOpt();
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Initialization for optimized int8 conv failed.";
       return RET_ERROR;
     }
-    return RET_OK;
-  }
-  else {
+  } else {
     // init for situation that not support sdot
     ret = InitWeightBias();
     if (ret != RET_OK) {
      MS_LOG(ERROR) << "Init weight bias failed.";
      return RET_ERROR;
    }
  }
-  // init for situation that not support sdot
-  ret = InitWeightBias();
+  return ReSize();
 }
 
+int ConvolutionInt8CPUKernel::ReSize() {
+  auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
+  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init weight bias failed.";
-    return RET_ERROR;
+    MS_LOG(ERROR) << "Resize is invalid.";
+    return ret;
   }
-  // init tmp input, output
-  ret = InitTmpBuffer();
+  FreeTmpBuffer();
+  if (nhwc4_input_ != nullptr) {
+    free(nhwc4_input_);
+    nhwc4_input_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init tmp buffer failed.";
+    MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
+  /*=============================nhwc4_input_============================*/
+  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
+  nhwc4_input_ = malloc(nhwc4_input_size);
+  if (nhwc4_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
+    return RET_ERROR;
+  }
+  memset(nhwc4_input_, 0, nhwc4_input_size);
+  /*=============================packed_input_============================*/
+  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
+  int output_tile_count = UP_DIV(output_count, tile_num_);
+  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
+  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
+  int packed_input_size = output_tile_count * tile_num_ * unit_size;
+  packed_input_ = reinterpret_cast<int8_t *>(malloc(conv_param_->input_batch_ * packed_input_size));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "malloc packed_input_ failed.";
+    return RET_ERROR;
+  }
+  memset(packed_input_, 0, conv_param_->input_batch_ * packed_input_size);
   return RET_OK;
 }
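The packed_input_ sizing in the rewritten ReSize is worth unpacking: both the kernel plane and the channel count are padded up to multiples of C4NUM before being multiplied out, so every im2col tile position has a fixed stride. With a hypothetical 3x3 kernel over 3 input channels:

#include <cassert>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define C4NUM 4

int main() {
  // Hypothetical 3x3 conv over 3 input channels, just to trace the formula.
  int kernel_plane = 3 * 3;                    // 9 taps
  int plane_c4 = UP_DIV(kernel_plane, C4NUM);  // 9 -> 3 blocks, i.e. padded to 12 taps
  int ic4 = UP_DIV(3, C4NUM);                  // 3 channels -> 1 block of 4
  int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
  assert(unit_size == 48);                     // int8 bytes per tile position
  return 0;
}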
...
@@ -369,7 +343,7 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
   return RET_OK;
 }
 
-int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *mpenv, void *cdata) {
   auto conv = reinterpret_cast<ConvolutionInt8CPUKernel *>(cdata);
   auto error_code = conv->RunImpl(task_id);
   if (error_code != RET_OK) {
...
@@ -385,19 +359,33 @@ int ConvolutionInt8CPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare failed.";
     return RET_ERROR;
   }
+  if (support_optimize_) {
+    ret = InitTmpBufferOpt();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init tmp buffer failed.";
+      return RET_ERROR;
+    }
+  } else {
+    // init tmp input, output
+    ret = InitTmpBuffer();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init tmp buffer failed.";
+      return RET_ERROR;
+    }
+  }
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ori_input_data = input_tensor->Data();
-  int in_batch = conv_param_->input_batch_;
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_channel = conv_param_->input_channel_;
-  convert_func_(ori_input_data, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
+                conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   int error_code = LiteBackendParallelLaunch(ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
+    FreeTmpBuffer();
     return RET_ERROR;
   }
+  FreeTmpBuffer();
   return RET_OK;
 }
...
...
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
浏览文件 @
96df6f87
...
...
@@ -30,14 +30,27 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
outputs
,
const
Context
*
ctx
,
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
:
ConvolutionBaseCPUKernel
(
parameter
,
inputs
,
outputs
,
ctx
,
primitive
)
{}
~
ConvolutionInt8CPUKernel
()
override
{
FreeTmpBuffer
();
}
~
ConvolutionInt8CPUKernel
()
override
{
FreeQuantParam
();
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
}
if
(
packed_input_
!=
nullptr
)
{
free
(
packed_input_
);
packed_input_
=
nullptr
;
}
if
(
input_sum_
!=
nullptr
)
{
free
(
input_sum_
);
input_sum_
=
nullptr
;
}
}
int
Init
()
override
;
int
ReSize
()
override
;
int
Run
()
override
;
int
RunImpl
(
int
task_id
);
void
CheckSupportOptimize
();
int
InitOpt
();
int
InitWeightBiasOpt
();
int
InitTmpBufferOpt
();
int
InitWeightBias
();
...
...
@@ -46,27 +59,14 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
private:
void
FreeTmpBuffer
()
{
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
}
if
(
packed_input_
!=
nullptr
)
{
free
(
packed_input_
);
packed_input_
=
nullptr
;
}
if
(
input_sum_
!=
nullptr
)
{
free
(
input_sum_
);
input_sum_
=
nullptr
;
}
if
(
tmp_dst_
!=
nullptr
)
{
f
ree
(
tmp_dst_
);
ctx_
->
allocator
->
F
ree
(
tmp_dst_
);
tmp_dst_
=
nullptr
;
}
if
(
tmp_out_
!=
nullptr
)
{
f
ree
(
tmp_out_
);
ctx_
->
allocator
->
F
ree
(
tmp_out_
);
tmp_out_
=
nullptr
;
}
FreeQuantParam
();
}
bool
support_optimize_
=
true
;
int8_t
*
packed_weight_
=
nullptr
;
...
...
mindspore/lite/src/runtime/kernel/arm/nnacl/pack.c  View file @ 96df6f87
...
@@ -228,10 +228,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
 #ifdef ENABLE_NEON
           vst1q_f32(packed_input + channel_block_offset, vld1q_f32(input_data + channel_block_stride));
 #else
-          (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
-          (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
-          (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
-          (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
+          for (int k = 0; k < C4NUM; ++k) {
+            (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k];
+          }
 #endif
         }  // channel_block loop
       }    // kernel_w loop
...
@@ -349,10 +348,9 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
         for (int m = 0; m < ic4; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
          int channel_block_offset = input_plane_offset + m * tile_num * C4NUM;
-          (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
-          (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
-          (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
-          (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
+          for (int k = 0; k < C4NUM; k++) {
+            (packed_input + channel_block_offset)[k] = (input_data + channel_block_stride)[k];
+          }
         }  // channel_block loop
       }    // kernel_w loop
     }      // kernel_h loop
...
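The pack.c change swaps four hand-unrolled element copies for a loop bounded by C4NUM, which keeps the scalar fallback correct if the block width ever changes and lets the compiler unroll or vectorize it on its own; the NEON path above it is untouched. The same copy can also be expressed as a single memcpy, shown here as an equivalent sketch:

#include <string.h>

#define C4NUM 4

/* Equivalent to the new loop: copy one C4NUM-wide channel block from the
   im2col source position into the packed buffer. */
static inline void CopyChannelBlockFp32(float *packed_input, const float *input_data,
                                        int channel_block_offset, int channel_block_stride) {
  memcpy(packed_input + channel_block_offset, input_data + channel_block_stride,
         C4NUM * sizeof(float));
}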