Commit e6157bf1 (magicwindyyd/mindspore, forked from MindSpore/mindspore)
Authored Aug 03, 2020 by fuzhiye
Parent: ecb87385

    extract layout tranform func for fp16 op

Showing 22 changed files, with 207 additions and 95 deletions (+207 -95).
Changed files:

  mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc        +0   -10
  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc    +6   -4
  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc        +6   -4
  mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc   +39  -0
  mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h    +27  -0
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc             +12  -2
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h              +2   -1
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc         +23  -11
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h          +2   -3
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc    +28  -14
  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h     +3   -3
  mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc           +6   -1
  mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h            +3   -0
  mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc             +13  -11
  mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h              +8   -3
  mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc        +3   -1
  mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h       +4   -4
  mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc                  +10  -11
  mindspore/lite/src/runtime/kernel/arm/opclib/pack.h                   +2   -1
  mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc    +7   -8
  mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h     +2   -2
  mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc    +1   -1
mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc (+0 -10)

@@ -19,12 +19,6 @@
 using mindspore::schema::Format;

 namespace mindspore::kernel {
-#ifdef ENABLE_FP16
-LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
-  // todo
-  return nullptr;
-}
-#endif
 LayoutConvertor LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
   // todo
   if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {

@@ -58,10 +52,6 @@ LayoutConvertor LayoutTransform(TypeId data_type, schema::Format src_format, sch
   switch (data_type) {
     case kNumberTypeInt8:
       return LayoutTransformInt8(src_format, dst_format);
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      return LayoutTransformFp16(src_format, dst_format);
-#endif
     case kNumberTypeFloat32:
       return LayoutTransformFp32(src_format, dst_format);
     default:
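Aside: the base file previously carried an ENABLE_FP16-guarded stub; this commit moves the real fp16 path into its own translation unit while the type-based dispatch above keeps compiling either way. A minimal standalone sketch of that dispatch pattern, with hypothetical stand-in types (not the project's real TypeId/LayoutConvertor):

#include <cstdio>

// Hypothetical stand-ins for the real TypeId/LayoutConvertor types.
enum TypeId { kNumberTypeInt8, kNumberTypeFloat16, kNumberTypeFloat32 };
typedef void (*LayoutConvertor)(const void *src, void *dst);

static void ConvertInt8(const void *, void *) { std::puts("int8 convert"); }
static void ConvertFp32(const void *, void *) { std::puts("fp32 convert"); }
#ifdef ENABLE_FP16
static void ConvertFp16(const void *, void *) { std::puts("fp16 convert"); }
#endif

// Dispatch mirrors LayoutTransform: unsupported types yield nullptr, and
// the fp16 case disappears entirely when ENABLE_FP16 is off.
LayoutConvertor PickConvertor(TypeId data_type) {
  switch (data_type) {
    case kNumberTypeInt8:
      return ConvertInt8;
#ifdef ENABLE_FP16
    case kNumberTypeFloat16:
      return ConvertFp16;
#endif
    case kNumberTypeFloat32:
      return ConvertFp32;
    default:
      return nullptr;
  }
}

int main() {
  LayoutConvertor fn = PickConvertor(kNumberTypeFloat32);
  if (fn != nullptr) fn(nullptr, nullptr);
  return 0;
}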
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc (+6 -4)

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/winograd_transform_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

@@ -159,9 +159,11 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() {
   auto output_tensor = outputs_.at(kOutputIndex);
   output_tensor->SetFormat(schema::Format_NHWC);
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
 }
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc (+6 -4)

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

@@ -130,9 +130,11 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
 void ConvolutionFP16CPUKernel::ConfigInputOutput() {
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
   auto output_tensor = outputs_.at(kOutputIndex);
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc (new file, +39 -0)

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
#include "schema/ops_generated.h"
#include "mindspore/core/utils/log_adapter.h"

namespace mindspore::kernel {
LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
    return PackNHWCToNC4HW4Fp16;
  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
    return PackNHWCToNHWC4Fp16;
  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC4) {
    return PackNC4HW4ToNHWC4Fp16;
  } else if (src_format == schema::Format_NCHW && dst_format == schema::Format_NC4HW4) {
    return PackNCHWToNC4HW4Fp16;
  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC) {
    return PackNC4HW4ToNHWCFp16;
  } else {
    MS_LOG(ERROR) << "Unsupported transform from " << schema::EnumNameFormat(src_format) << " to "
                  << schema::EnumNameFormat(dst_format);
    return nullptr;
  }
}
}  // namespace mindspore::kernel
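Aside: the new file maps (src_format, dst_format) pairs to fp16 packing routines through an if/else chain. As the number of supported pairs grows, the same lookup can be table-driven; a minimal sketch of that alternative, with hypothetical format tags and a converter signature assumed only for illustration (the real LayoutConvertor typedef lives in base/layout_transform.h):

#include <cstddef>
#include <cstdio>

// Hypothetical format tags and converter signature, for illustration only.
enum Format { Format_NHWC, Format_NCHW, Format_NC4HW4, Format_NHWC4 };
typedef void (*LayoutConvertor)(const void *src, void *dst, int batch, int plane, int channel);

static void PackNHWCToNHWC4(const void *, void *, int, int, int) { std::puts("NHWC -> NHWC4"); }
static void PackNHWCToNC4HW4(const void *, void *, int, int, int) { std::puts("NHWC -> NC4HW4"); }

// Each supported (src, dst) pair maps to one packing routine.
struct ConvertEntry {
  Format src;
  Format dst;
  LayoutConvertor fn;
};

static const ConvertEntry kTable[] = {
  {Format_NHWC, Format_NHWC4, PackNHWCToNHWC4},
  {Format_NHWC, Format_NC4HW4, PackNHWCToNC4HW4},
};

LayoutConvertor LookupConvertor(Format src, Format dst) {
  for (size_t i = 0; i < sizeof(kTable) / sizeof(kTable[0]); ++i) {
    if (kTable[i].src == src && kTable[i].dst == dst) return kTable[i].fn;
  }
  return nullptr;  // mirrors the MS_LOG(ERROR) + nullptr fallback above
}

int main() {
  LayoutConvertor fn = LookupConvertor(Format_NHWC, Format_NHWC4);
  if (fn != nullptr) fn(nullptr, nullptr, 1, 1, 1);
  return 0;
}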
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h (new file, +27 -0)

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_

#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "schema/ops_generated.h"

namespace mindspore::kernel {
LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format);
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc (+12 -2)

@@ -19,6 +19,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_3x3.h"
 #include "src/runtime/kernel/arm/fp32/convolution_winograd.h"
 #include "src/runtime/kernel/arm/opclib/fp32/conv.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"

@@ -56,7 +57,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackWeightFp32(origin_weight, conv_param_, packed_weight_);
+  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);

   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));

@@ -125,6 +126,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int ConvolutionCPUKernel::Init() {

@@ -175,9 +181,13 @@ int ConvolutionCPUKernel::ReSize() {
 }

 int ConvolutionCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_);
+           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h (+2 -1)

@@ -21,6 +21,7 @@
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv.h"

 namespace mindspore::kernel {
 class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {

@@ -52,8 +53,8 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_input_;
   float *packed_weight_;
   float *tmp_output_block_;
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc (+23 -11)

@@ -29,14 +29,13 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block,
+                   int oc_block_num) {
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   int iC4 = UP_DIV(input_channel, C4NUM);
-  int oc8 = UP_DIV(output_channel, C8NUM);
-  size_t tmp_size = oc8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float);
+  size_t tmp_size = oc_block_num * oc_block * iC4 * C4NUM * kernel_plane * sizeof(float);
   auto tmp_addr = reinterpret_cast<float *>(malloc(tmp_size));
   if (tmp_addr == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_addr failed.";

@@ -45,8 +44,7 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
   memset(tmp_addr, 0, tmp_size);
   PackNHWCToNC4HW4Fp32(origin_weight, tmp_addr, output_channel, kernel_plane, input_channel);
-  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
+  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane, oc_block);
   free(tmp_addr);
 }

@@ -55,10 +53,17 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   auto output_channel = conv_param_->output_channel_;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
-  int oC8 = UP_DIV(output_channel, C8NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   int k_plane = 16;
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * k_plane * sizeof(float);
+  size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float);
   transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed filter addr failed.";

@@ -66,7 +71,7 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   }
   memset(transformed_filter_addr_, 0, transformed_size);
   auto weight_data = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->Data());
-  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_);
+  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_, oc_block, oc_block_num);

   // init bias
   size_t new_bias_size = oC4 * C4NUM * sizeof(float);

@@ -89,7 +94,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int k_plane = 16;
-  // todo
   size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {

@@ -148,6 +152,11 @@ void Convolution3x3CPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int Convolution3x3CPUKernel::Init() {

@@ -201,9 +210,13 @@ int Convolution3x3CPUKernel::ReSize() {
 }

 int Convolution3x3CPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_);
+              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }

@@ -234,4 +247,3 @@ int Convolution3x3CPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
-
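Aside: the kernels above now pick the output-channel block size per architecture (C4 lanes on ARM32, C8 elsewhere) and size buffers from oc_block_num * oc_block instead of the hard-coded oC8 * C8NUM. A small self-contained sketch of that arithmetic, using a runtime flag in place of the ENABLE_ARM32 compile-time switch:

#include <cstdio>

// UP_DIV as used throughout opclib: ceiling division.
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define C4NUM 4
#define C8NUM 8

int main() {
  int output_channel = 21;

  // Same selection InitWeightBias performs; a bool stands in for #ifdef ENABLE_ARM32.
  bool arm32 = false;
  int oc_block = arm32 ? C4NUM : C8NUM;
  int oc_block_num = UP_DIV(output_channel, oc_block);

  // Buffer sizing pads 21 channels up to 3 blocks of 8 = 24 lanes.
  std::printf("oc_block=%d oc_block_num=%d padded_channels=%d\n", oc_block, oc_block_num,
              oc_block * oc_block_num);
  return 0;
}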
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h (+2 -3)

@@ -19,7 +19,6 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/runtime/kernel/arm/opclib/winograd_transform.h"

@@ -62,9 +61,9 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *tmp_dst_buffer_;
   float *nc4hw4_out_;
   TmpBufferAddress tmp_buffer_address_list_[4];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param);
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_3X3_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc (+28 -14)

@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param) {
+                             ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;

@@ -53,10 +53,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
   int kernel_plane_stride = channel_in;
   for (int i = 0; i < channel_out; i++) {
-    int oc8_block = i / C8NUM;
-    int oc8_res = i % C8NUM;
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = oc8_block * strides[1] * input_unit * input_unit + oc8_res;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;

@@ -88,16 +88,24 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
   int output_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   // init weight
-  auto ret = MallocFilterMatrix();
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
   auto weight_tensor = inputs_.at(kWeightIndex);
   auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_);
+  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);

   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);

@@ -112,14 +120,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   return RET_OK;
 }

-int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
+int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
   int channel_out = conv_param_->output_channel_;
   int ic4 = UP_DIV(channel_in, BLOCK);
-  int oc8 = UP_DIV(channel_out, C8NUM);

   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * oc8 * C4NUM * C8NUM * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";

@@ -134,10 +140,10 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
   std::vector<int> strides;
   // set shape
   shapes.push_back(input_unit_ * input_unit_);
-  shapes.push_back(oc8);
+  shapes.push_back(oc_block_num);
   shapes.push_back(ic4);
   shapes.push_back(C4NUM);
-  shapes.push_back(C8NUM);
+  shapes.push_back(oc_block);
   // set stride
   for (int i = 0; i < 4; i++) {
     int stride = 1;

@@ -227,6 +233,11 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Get output_trans_func_ failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
   return RET_OK;
 }

@@ -301,10 +312,14 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }

 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), reinterpret_cast<float *>(trans_weight_->GetData()),
                    reinterpret_cast<const float *>(bias_data_), output_addr, tmp_buffer_address_list_, task_id,
-                   conv_param_, input_trans_func_, output_trans_func_);
+                   conv_param_, input_trans_func_, output_trans_func_, gemm_func_);
   return RET_OK;
 }

@@ -335,4 +350,3 @@ int ConvolutionWinogradCPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
-
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h (+3 -3)

@@ -50,7 +50,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int RunImpl(int task_id);
   int InitWeightBias();
-  int MallocFilterMatrix();
+  int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();

@@ -66,9 +66,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
   TmpBufferAddress tmp_buffer_address_list_[5];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param);
+                             ConvParameter *conv_param, int oc_block);
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc (+6 -1)

@@ -17,7 +17,7 @@
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"

-#ifndef __aarch64__
+#ifndef ENABLE_ARM64
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
                       int output_channel, size_t offset, size_t relu, size_t relu6) {
   for (int i = 0; i < TILE_NUM; i++) {

@@ -102,6 +102,11 @@ void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight
   }
 }
 #endif

+#ifndef ENABLE_ARM32
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6) {}
+#endif

 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
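Aside: on platforms without the ARM32 assembly kernel, the empty IndirectGemmFp32_8x4 body appears to serve as a link-time placeholder: the symbol stays defined so code that takes its address compiles everywhere, while the dispatchers only ever assign it under ENABLE_ARM32, so the stub is never called. A minimal sketch of that stub-plus-dispatch pattern, under those assumptions:

#include <cstdio>

typedef void (*GemmFn)(float *out, const float *in);

void GemmGeneric(float *out, const float *in) { out[0] = in[0]; }

#ifdef ENABLE_ARM32
// The real implementation would live in platform-specific assembly.
void GemmArm32(float *out, const float *in);
#else
// Empty stub: defined so &GemmArm32 is always a valid expression;
// never selected, because the dispatcher below only picks it on ARM32.
void GemmArm32(float *out, const float *in) {}
#endif

int main() {
  GemmFn fn =
#ifdef ENABLE_ARM32
      GemmArm32;
#else
      GemmGeneric;
#endif
  float in = 2.0f, out = 0.0f;
  fn(&out, &in);
  std::printf("out=%f\n", out);
  return 0;
}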
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h (+3 -0)

@@ -36,6 +36,9 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6);
 void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc,
                            size_t offset);
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc (+13 -11)

@@ -20,7 +20,8 @@
 // fp32 conv common
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param) {
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_batch = conv_param->input_batch_;

@@ -57,12 +58,12 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
     int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
     if (real_cal_num == TILE_NUM) {
       float *gemm_output = output_data + out_offset;
-      IndirectGemmFp32_8x8(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                           output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+      gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+                conv_param->is_relu_, conv_param->is_relu6_);
     } else {
       // res part
-      IndirectGemmFp32_8x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                           output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+      gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+                conv_param->is_relu_, conv_param->is_relu6_);
       memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
     }
   }

@@ -78,7 +79,8 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 conv winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) {
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func) {
   int thread_num = conv_param->thread_num_;
   int input_unit = conv_param->input_unit_;
   int in_batch = conv_param->input_batch_;

@@ -111,8 +113,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
     WinogradInputTransform(input_data, trans_input, tmp_data, cal_num, out_tile_index, out_w_block, conv_param,
                            input_trans_func);
     // step 3 : gemm
-    IndirectGemmFp32_8x8(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         output_offset, 1, 1, 0, 0);
+    gemm_func(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1,
+              0, 0);
     // step 4 : output transform
     WinogradOutputTransform(gemm_out, tmp_out_data, bias_data, cal_num, out_tile_index, out_w_block, conv_param,

@@ -173,7 +175,7 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param) {
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;

@@ -198,8 +200,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
     Conv3x3Fp32InputTransform(input_data, tile_buffer, block_unit_buffer, start_index, real_cal_num, out_w_block,
                               conv_param);
-    IndirectGemmFp32_8x8(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
+    gemm_func(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
+              oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
     Conv3x3Fp32OutputTransform(tmp_dst_buffer, nc4hw4_out, bias_data, start_index, real_cal_num, out_w_block,
                                conv_param);
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h (+8 -3)

@@ -28,10 +28,14 @@
 #include "src/runtime/kernel/arm/opclib/winograd_utils.h"

 using TmpBufferAddress = float *;
+typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias,
+                               size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode,
+                               size_t writeC4, size_t relu, size_t relu6);

 // fp32 convolution common (im2col+gemm)
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param);
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func);

 // fp32 conv1x1 strassen matmul
 int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,

@@ -40,12 +44,13 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 convolution winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func);
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func);

 void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel,
                           int output_unit);

 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param);
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_H_
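Aside: the new GEMM_FUNC_FP32 typedef is the pivot of this commit: the convolution entry points take the GEMM kernel as a function pointer instead of hard-calling IndirectGemmFp32_8x8, so one driver serves both the 8x8 and 8x4 tile shapes. A minimal sketch of the injection pattern, with the signature reduced to two parameters for brevity:

#include <cstdio>

// Same idea as GEMM_FUNC_FP32, reduced to two parameters.
typedef void (*GemmFunc)(float *output, const float *input);

static void Gemm8x8(float *output, const float *input) { output[0] = input[0] * 8; }
static void Gemm8x4(float *output, const float *input) { output[0] = input[0] * 4; }

// Driver mirrors ConvFp32: it never names a concrete kernel;
// the caller injects whichever tile shape the target supports.
void ConvDriver(float *output, const float *input, GemmFunc gemm_func) {
  if (gemm_func == nullptr) return;  // same guard RunImpl now performs
  gemm_func(output, input);
}

int main() {
  float in = 1.0f, out = 0.0f;
#ifdef ENABLE_ARM32
  ConvDriver(&out, &in, Gemm8x4);
#else
  ConvDriver(&out, &in, Gemm8x8);
#endif
  std::printf("out=%f\n", out);
  return 0;
}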
mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc (+3 -1)

@@ -49,7 +49,9 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in
 #ifdef __aarch64__
   IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
                        input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
-  // todo arm32
+#elif defined(ENABLE_ARM32)
+  IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
+                       input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
 #else
   int tile_num = conv_param->tile_num_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
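Aside: the "todo arm32" comment becomes a real three-way preprocessor dispatch: aarch64 assembly, ARM32 assembly, or the portable C fallback. A tiny sketch of the same selection structure (the printed strings are illustrative, not the project's output):

#include <cstdio>

// Three-way architecture dispatch mirroring IndirectGemmInt8.
const char *SelectedKernel() {
#ifdef __aarch64__
  return "IndirectGemmInt8_4x4 (aarch64 asm)";
#elif defined(ENABLE_ARM32)
  return "IndirectGemmInt8_2x4 (arm32 asm)";
#else
  return "generic C path";
#endif
}

int main() {
  std::printf("%s\n", SelectedKernel());
  return 0;
}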
mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h (+4 -4)

@@ -58,10 +58,10 @@ class OptimizeModule {
     if ((!support_optimize_ops) && (!support_fp16)) {
       return;
     }
-    // optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    // if (optimized_op_handler_ == nullptr) {
-    //   printf("Open optimize shared library failed.\n");
-    // }
+    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
+    if (optimized_op_handler_ == nullptr) {
+      printf("Open optimize shared library failed.\n");
+    }
   }

   ~OptimizeModule() = default;
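Aside: this hunk re-enables the previously commented-out dlopen of the optimized-kernel shared library. A standalone sketch of the POSIX dlopen/dlsym flow it relies on; the library path and symbol name here are placeholders, not the project's actual values:

#include <cstdio>
#include <dlfcn.h>

int main() {
  // Path is a placeholder; OptimizeModule uses OPTIMIZE_SHARED_LIBRARY_PATH.
  void *handle = dlopen("libexample_optimize.so", RTLD_LAZY);
  if (handle == nullptr) {
    // Non-fatal, as in OptimizeModule: fall back to the default kernels.
    std::printf("Open optimize shared library failed: %s\n", dlerror());
    return 0;
  }
  // Resolve an optimized entry point by name before using it.
  void *sym = dlsym(handle, "OptimizedGemm");  // hypothetical symbol
  if (sym != nullptr) {
    std::printf("optimized kernel available\n");
  }
  dlclose(handle);
  return 0;
}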
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc (+10 -11)

@@ -18,20 +18,19 @@
 #include <cstring>
 #include <cstdlib>

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num) {
   // original weight format : ohwi
-  // todo pack weight for arm32 platform
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_channel = conv_param->input_channel_;
   int out_channel = conv_param->output_channel_;
-  int oc8 = UP_DIV(out_channel, C8NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * ic4 * C4NUM * kernel_plane;
-  int unit_size = C8NUM * C4NUM;
-  int block_size = pack_weight_size / oc8;
+  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
+  int unit_size = oc_block * C4NUM;
+  int block_size = pack_weight_size / oc_block_num;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;

@@ -43,12 +42,12 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed
       int real_ic_num = ic_remainder < C4NUM ? ic_remainder : C4NUM;
       for (int h = 0; h < real_ic_num; h++) {
         int block_stride = channel_block_stride + h;
-        int packed_block_stride = packed_channel_block_size + h * C8NUM;
-        for (int j = 0; j < oc8; j++) {
-          int kernel_block_stride = block_stride + j * C8NUM * kernel_plane * in_channel;
+        int packed_block_stride = packed_channel_block_size + h * oc_block;
+        for (int j = 0; j < oc_block_num; j++) {
+          int kernel_block_stride = block_stride + j * oc_block * kernel_plane * in_channel;
           int packed_kernel_block_size = packed_block_stride + j * block_size;
-          int oc_remainder = out_channel - j * C8NUM;
-          int real_oc_num = oc_remainder < C8NUM ? oc_remainder : C8NUM;
+          int oc_remainder = out_channel - j * oc_block;
+          int real_oc_num = oc_remainder < oc_block ? oc_remainder : oc_block;
           for (int k = 0; k < real_oc_num; k++) {
             float *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
             float *packed_data_ptr = packed_weight + packed_kernel_block_size + k;
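Aside: PackWeightFp32 now derives every stride from the (oc_block, oc_block_num) pair, so the same routine produces the OC8 layout on aarch64 and the OC4 layout on ARM32; tail channels are clamped through real_oc_num so the padding lanes stay untouched. A small worked sketch of just that remainder arithmetic:

#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

int main() {
  // 10 output channels packed into blocks of oc_block lanes; the tail block
  // only copies real_oc_num channels, the rest remains zero padding.
  int out_channel = 10;
  int oc_block = 8;  // C8NUM on aarch64; would be C4NUM (4) on ARM32
  int oc_block_num = UP_DIV(out_channel, oc_block);

  for (int j = 0; j < oc_block_num; j++) {
    int oc_remainder = out_channel - j * oc_block;
    int real_oc_num = oc_remainder < oc_block ? oc_remainder : oc_block;
    std::printf("block %d: packs %d of %d lanes\n", j, real_oc_num, oc_block);
  }
  return 0;
}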
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h (+2 -1)

@@ -40,7 +40,8 @@ void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);
 void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight);
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num);

 void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum);
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc (+7 -8)

@@ -326,18 +326,18 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
   }
 }

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
-                                int kernel_plane) {
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
+                                int kernel_plane, int oc_block) {
   int input_unit = 4;
-  int dst_step = iC4 * C4NUM * C8NUM;
+  int dst_step = iC4 * C4NUM * oc_block;
   for (int o = 0; o < output_channel; o++) {
-    int oc8_block_num = o / C8NUM;
-    int oc8_block_rem = o % C8NUM;
+    int oc_block_num = o / oc_block;
+    int oc_block_rem = o % oc_block;
     int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
-    int dst_oc_offset = oc8_block_num * C8NUM * iC4 * C4NUM * input_unit * input_unit + oc8_block_rem;
+    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
     for (int i = 0; i < iC4; i++) {
       float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
-      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * C8NUM * C4NUM;
+      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
 #ifdef ENABLE_ARM
       float32x4_t g00 = vld1q_f32(src_ic4_ptr);
       float32x4_t g01 = vld1q_f32(src_ic4_ptr + 4);

@@ -1368,4 +1368,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
     }
   }
 }
-
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h (+2 -2)

@@ -43,7 +43,8 @@ void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t
 void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data, int start_index,
                                int real_cal_num, int out_w_block, ConvParameter *conv_param);

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane);
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
+                                int kernel_plane, int oc_block);

 void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float *output_data, bool h_not_bound,
                            bool w_not_bound, int output_w);

@@ -67,4 +68,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
                                  int real_cal_num, int out_w_block, ConvParameter *conv_param);
-
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_WINOGRAD_TRANSFORM_H_
mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc (+1 -1)

@@ -122,7 +122,7 @@ TEST_F(TestPack, PackWeightFp32) {
   std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
   auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
   auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float)));
-  PackWeightFp32(weight_data, conv_param, packed_weight);
+  PackWeightFp32(weight_data, conv_param, packed_weight, C8NUM, oc8);

   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {