Commit 4edcce36 in MindSpore / mindspore
Authored Aug 03, 2020 by mindspore-ci-bot; committed via Gitee on Aug 03, 2020
!3864 fix fp16 init func bug && enable arm32 assembly code
Merge pull request !3864 from fuzhiye/mindspore
Parents: 25fdb67e, e6157bf1
Showing 22 changed files with 207 additions and 95 deletions (+207, -95)
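Taken together, the changes below do two things. First, they fix the FP16 init-function bug: the FP16 layout-transform lookup, previously a stub in base/layout_transform.cc that always returned nullptr, moves to the new files fp16/layout_transform_fp16.{h,cc} and now returns real fp16 pack functions, and the FP16 convolution kernels are rewired to use it. Second, they enable the arm32 assembly code: the FP32 convolution kernels gain a GEMM_FUNC_FP32 function pointer that is set to IndirectGemmFp32_8x4 under ENABLE_ARM32 and to IndirectGemmFp32_8x8 otherwise, and the weight-packing and filter-transform helpers are generalized from a hard-coded C8NUM output-channel block to a caller-supplied oc_block / oc_block_num.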
mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc (+0, -10)
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc (+6, -4)
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc (+6, -4)
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc (new file, +39, -0)
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h (new file, +27, -0)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc (+12, -2)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h (+2, -1)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc (+23, -11)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h (+2, -3)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc (+28, -14)
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h (+3, -3)
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc (+6, -1)
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h (+3, -0)
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc (+13, -11)
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h (+8, -3)
mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc (+3, -1)
mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h (+4, -4)
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc (+10, -11)
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h (+2, -1)
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc (+7, -8)
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h (+2, -2)
mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc (+1, -1)
mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc

@@ -19,12 +19,6 @@
 using mindspore::schema::Format;

 namespace mindspore::kernel {
-#ifdef ENABLE_FP16
-LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
-  // todo
-  return nullptr;
-}
-#endif
 LayoutConvertor LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
   // todo
   if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {

@@ -58,10 +52,6 @@ LayoutConvertor LayoutTransform(TypeId data_type, schema::Format src_format, sch
   switch (data_type) {
     case kNumberTypeInt8:
       return LayoutTransformInt8(src_format, dst_format);
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      return LayoutTransformFp16(src_format, dst_format);
-#endif
     case kNumberTypeFloat32:
       return LayoutTransformFp32(src_format, dst_format);
     default:
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/winograd_transform_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

@@ -159,9 +159,11 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() {
   auto output_tensor = outputs_.at(kOutputIndex);
   output_tensor->SetFormat(schema::Format_NHWC);
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
 }
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

@@ -130,9 +130,11 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
 void ConvolutionFP16CPUKernel::ConfigInputOutput() {
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
   auto output_tensor = outputs_.at(kOutputIndex);
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc
new file mode 100644

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
#include "schema/ops_generated.h"
#include "mindspore/core/utils/log_adapter.h"

namespace mindspore::kernel {
LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
    return PackNHWCToNC4HW4Fp16;
  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
    return PackNHWCToNHWC4Fp16;
  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC4) {
    return PackNC4HW4ToNHWC4Fp16;
  } else if (src_format == schema::Format_NCHW && dst_format == schema::Format_NC4HW4) {
    return PackNCHWToNC4HW4Fp16;
  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC) {
    return PackNC4HW4ToNHWCFp16;
  } else {
    MS_LOG(ERROR) << "Unsupported transform from " << schema::EnumNameFormat(src_format) << " to "
                  << schema::EnumNameFormat(dst_format);
    return nullptr;
  }
}
}  // namespace mindspore::kernel
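The FP16 kernels consume this lookup in ConfigInputOutput: fetch the converter once, treat nullptr as a configuration error (previously LayoutTransformFp16 in the base layer returned nullptr unconditionally, which is the init bug being fixed). A minimal self-contained sketch of the same lookup-and-check pattern, using stand-in types rather than the real schema::Format enum and fp16 pack routines:

#include <cstdio>

// Stand-ins for mindspore::schema::Format and the LayoutConvertor typedef
// from base/layout_transform.h (the real signatures differ).
enum Format { Format_NHWC, Format_NHWC4, Format_NC4HW4, Format_NCHW };
using LayoutConvertor = void (*)(const void *src, void *dst, int batch, int plane, int channel);

void PackNHWCToNHWC4Stub(const void *, void *, int, int, int) { std::puts("pack NHWC -> NHWC4"); }

// Same if-chain dispatch shape as LayoutTransformFp16 above.
LayoutConvertor Lookup(Format src, Format dst) {
  if (src == Format_NHWC && dst == Format_NHWC4) {
    return PackNHWCToNHWC4Stub;
  }
  return nullptr;  // unsupported pair: callers must check for nullptr
}

int main() {
  // Mirrors ConfigInputOutput in convolution_fp16.cc: look up once, fail fast.
  LayoutConvertor convert_func = Lookup(Format_NHWC, Format_NHWC4);
  if (convert_func == nullptr) {
    std::puts("layout convert func is nullptr.");
    return 1;
  }
  convert_func(nullptr, nullptr, 1, 1, 1);
  return 0;
}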
mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h
new file mode 100644

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_

#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "schema/ops_generated.h"

namespace mindspore::kernel {
LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format);
}  // namespace mindspore::kernel

#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc

@@ -19,6 +19,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_3x3.h"
 #include "src/runtime/kernel/arm/fp32/convolution_winograd.h"
 #include "src/runtime/kernel/arm/opclib/fp32/conv.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"

@@ -56,7 +57,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackWeightFp32(origin_weight, conv_param_, packed_weight_);
+  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);
   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));

@@ -125,6 +126,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int ConvolutionCPUKernel::Init() {

@@ -175,9 +181,13 @@ int ConvolutionCPUKernel::ReSize() {
 }

 int ConvolutionCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_);
+           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
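This kernel establishes the pattern repeated in convolution_3x3.cc and convolution_winograd.cc below: choose the GEMM once in ConfigInputOutput, store it in gemm_func_, and check it in RunImpl before use. A standalone sketch of the selection step (the typedef mirrors GEMM_FUNC_FP32 from opclib/fp32/conv.h in this commit; the empty bodies stand in for the assembly kernels):

#include <cstddef>

// Mirrors the GEMM_FUNC_FP32 typedef added to opclib/fp32/conv.h.
typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias,
                               size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode,
                               size_t writeC4, size_t relu, size_t relu6);

// Stubs standing in for the real (assembly) implementations.
void IndirectGemmFp32_8x4(float *, const float *, const float *, const float *, size_t, size_t, size_t,
                          size_t, size_t, size_t, size_t, size_t) {}
void IndirectGemmFp32_8x8(float *, const float *, const float *, const float *, size_t, size_t, size_t,
                          size_t, size_t, size_t, size_t, size_t) {}

GEMM_FUNC_FP32 SelectGemmFunc() {
#ifdef ENABLE_ARM32
  return IndirectGemmFp32_8x4;  // 4-wide output-channel tiles: the arm32 assembly enabled here
#else
  return IndirectGemmFp32_8x8;  // 8-wide tiles: the pre-existing default
#endif
}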
mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h

@@ -21,6 +21,7 @@
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv.h"

 namespace mindspore::kernel {
 class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {

@@ -52,8 +53,8 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_input_;
   float *packed_weight_;
   float *tmp_output_block_;
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc

@@ -29,14 +29,13 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block,
+                   int oc_block_num) {
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   int iC4 = UP_DIV(input_channel, C4NUM);
-  int oc8 = UP_DIV(output_channel, C8NUM);
-  size_t tmp_size = oc8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float);
+  size_t tmp_size = oc_block_num * oc_block * iC4 * C4NUM * kernel_plane * sizeof(float);
   auto tmp_addr = reinterpret_cast<float *>(malloc(tmp_size));
   if (tmp_addr == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_addr failed.";

@@ -45,8 +44,7 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
   memset(tmp_addr, 0, tmp_size);
   PackNHWCToNC4HW4Fp32(origin_weight, tmp_addr, output_channel, kernel_plane, input_channel);
-  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
+  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane, oc_block);
   free(tmp_addr);
 }

@@ -55,10 +53,17 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   auto output_channel = conv_param_->output_channel_;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
-  int oC8 = UP_DIV(output_channel, C8NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   int k_plane = 16;
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * k_plane * sizeof(float);
+  size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float);
   transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed filter addr failed.";

@@ -66,7 +71,7 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   }
   memset(transformed_filter_addr_, 0, transformed_size);
   auto weight_data = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->Data());
-  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_);
+  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_, oc_block, oc_block_num);
   // init bias
   size_t new_bias_size = oC4 * C4NUM * sizeof(float);

@@ -89,7 +94,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int k_plane = 16;
-  // todo
   size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {

@@ -148,6 +152,11 @@ void Convolution3x3CPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int Convolution3x3CPUKernel::Init() {

@@ -201,9 +210,13 @@ int Convolution3x3CPUKernel::ReSize() {
 }

 int Convolution3x3CPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_);
+              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }

@@ -234,4 +247,3 @@ int Convolution3x3CPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
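The oc_block / oc_block_num pair lets the same sizing code serve both tile widths. A small worked example of the arithmetic (UP_DIV is ceil division and C4NUM/C8NUM are 4 and 8, as in opclib; the channel counts are made up for illustration):

#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // ceil division, same as opclib's UP_DIV
constexpr int C4NUM = 4;
constexpr int C8NUM = 8;

int main() {
  int input_channel = 3, output_channel = 10, k_plane = 16;  // illustrative values
  int iC4 = UP_DIV(input_channel, C4NUM);  // 1

  // ENABLE_ARM32 branch: 4-wide output blocks -> 3 blocks for 10 channels.
  int size32 = iC4 * C4NUM * UP_DIV(output_channel, C4NUM) * C4NUM * k_plane;  // 768 floats
  // Default branch: 8-wide output blocks -> 2 blocks for 10 channels.
  int size64 = iC4 * C4NUM * UP_DIV(output_channel, C8NUM) * C8NUM * k_plane;  // 1024 floats

  // transformed_size from Convolution3x3CPUKernel::InitWeightBias, in floats:
  printf("arm32: %d, default: %d\n", size32, size64);
  return 0;
}

The padding differs: 10 output channels round up to 12 lanes with 4-wide blocks but 16 lanes with 8-wide blocks, which is why the buffer sizes must be computed from oc_block rather than from a hard-coded C8NUM.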
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h

@@ -19,7 +19,6 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/runtime/kernel/arm/opclib/winograd_transform.h"

@@ -62,9 +61,9 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *tmp_dst_buffer_;
   float *nc4hw4_out_;
   TmpBufferAddress tmp_buffer_address_list_[4];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };

-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param);
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block,
+                   int oc_block_num);
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_3X3_H_
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc

@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param) {
+                             ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;

@@ -53,10 +53,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
   int kernel_plane_stride = channel_in;
   for (int i = 0; i < channel_out; i++) {
-    int oc8_block = i / C8NUM;
-    int oc8_res = i % C8NUM;
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = oc8_block * strides[1] * input_unit * input_unit + oc8_res;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;

@@ -88,16 +88,24 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
   int output_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   // init weight
-  auto ret = MallocFilterMatrix();
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
   auto weight_tensor = inputs_.at(kWeightIndex);
   auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_);
+  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);

@@ -112,14 +120,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   return RET_OK;
 }

-int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
+int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int channel_out = conv_param_->output_channel_;
   int ic4 = UP_DIV(channel_in, BLOCK);
-  int oc8 = UP_DIV(channel_out, C8NUM);
   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * oc8 * C4NUM * C8NUM * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";

@@ -134,10 +140,10 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
   std::vector<int> strides;
   // set shape
   shapes.push_back(input_unit_ * input_unit_);
-  shapes.push_back(oc8);
+  shapes.push_back(oc_block_num);
   shapes.push_back(ic4);
   shapes.push_back(C4NUM);
-  shapes.push_back(C8NUM);
+  shapes.push_back(oc_block);
   // set stride
   for (int i = 0; i < 4; i++) {
     int stride = 1;

@@ -227,6 +233,11 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Get output_trans_func_ failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
   return RET_OK;
 }

@@ -301,10 +312,14 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }

 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), reinterpret_cast<float *>(trans_weight_->GetData()),
                    reinterpret_cast<const float *>(bias_data_), output_addr, tmp_buffer_address_list_, task_id,
-                   conv_param_, input_trans_func_, output_trans_func_);
+                   conv_param_, input_trans_func_, output_trans_func_, gemm_func_);
   return RET_OK;
 }

@@ -335,4 +350,3 @@ int ConvolutionWinogradCPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h

@@ -50,7 +50,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int RunImpl(int task_id);
   int InitWeightBias();
-  int MallocFilterMatrix();
+  int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();

@@ -66,9 +66,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
   TmpBufferAddress tmp_buffer_address_list_[5];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };

 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param);
+                             ConvParameter *conv_param, int oc_block);
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc

@@ -17,7 +17,7 @@
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"

-#ifndef __aarch64__
+#ifndef ENABLE_ARM64
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
                       int output_channel, size_t offset, size_t relu, size_t relu6) {
   for (int i = 0; i < TILE_NUM; i++) {

@@ -102,6 +102,11 @@ void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight
   }
 }
 #endif
+#ifndef ENABLE_ARM32
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6) {}
+#endif

 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
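Note the empty IndirectGemmFp32_8x4 body compiled under #ifndef ENABLE_ARM32: on arm32 builds the symbol presumably comes from the new assembly code, while on every other target this stub just keeps the declaration in common_func.h linkable; the #ifdef dispatch in the kernels ensures it is never actually invoked there.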
mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h

@@ -36,6 +36,9 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6);
 void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc,
                            size_t offset);
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc

@@ -20,7 +20,8 @@
 // fp32 conv common
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param) {
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_batch = conv_param->input_batch_;

@@ -57,12 +58,12 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
       int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
       if (real_cal_num == TILE_NUM) {
         float *gemm_output = output_data + out_offset;
-        IndirectGemmFp32_8x8(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                             output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+        gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+                  conv_param->is_relu_, conv_param->is_relu6_);
       } else {
         // res part
-        IndirectGemmFp32_8x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                             output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+        gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
+                  0, conv_param->is_relu_, conv_param->is_relu6_);
         memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
       }
     }

@@ -78,7 +79,8 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 conv winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) {
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func) {
   int thread_num = conv_param->thread_num_;
   int input_unit = conv_param->input_unit_;
   int in_batch = conv_param->input_batch_;

@@ -111,8 +113,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
       WinogradInputTransform(input_data, trans_input, tmp_data, cal_num, out_tile_index, out_w_block, conv_param,
                              input_trans_func);
       // step 3 : gemm
-      IndirectGemmFp32_8x8(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                           output_offset, 1, 1, 0, 0);
+      gemm_func(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1,
+                0, 0);
       // step 4 : output transform
       WinogradOutputTransform(gemm_out, tmp_out_data, bias_data, cal_num, out_tile_index, out_w_block, conv_param,

@@ -173,7 +175,7 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param) {
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;

@@ -198,8 +200,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
     Conv3x3Fp32InputTransform(input_data, tile_buffer, block_unit_buffer, start_index, real_cal_num, out_w_block,
                               conv_param);
-    IndirectGemmFp32_8x8(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
+    gemm_func(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
+              oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
     Conv3x3Fp32OutputTransform(tmp_dst_buffer, nc4hw4_out, bias_data, start_index, real_cal_num, out_w_block,
                                conv_param);
mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h

@@ -28,10 +28,14 @@
 #include "src/runtime/kernel/arm/opclib/winograd_utils.h"

 using TmpBufferAddress = float *;
+typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias,
+                               size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode,
+                               size_t writeC4, size_t relu, size_t relu6);

 // fp32 convolution common (im2col+gemm)
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param);
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func);

 // fp32 conv1x1 strassen matmul
 int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,

@@ -40,12 +44,13 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 convolution winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func);
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func);

 void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel,
                           int output_unit);

 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param);
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_H_
mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc

@@ -49,7 +49,9 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in
 #ifdef __aarch64__
   IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
                        input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
-  // todo arm32
+#elif defined(ENABLE_ARM32)
+  IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
+                       input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
 #else
   int tile_num = conv_param->tile_num_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
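The int8 path gets the parallel treatment: where aarch64 dispatches to the 4x4 assembly tile (IndirectGemmInt8_4x4), the new ENABLE_ARM32 branch calls IndirectGemmInt8_2x4 with an identical argument list, a narrower 2x4 tile presumably sized for the smaller NEON register file on 32-bit ARM, replacing the old "// todo arm32" marker.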
mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h

@@ -58,10 +58,10 @@ class OptimizeModule {
     if ((!support_optimize_ops) && (!support_fp16)) {
       return;
     }
-    //    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    //    if (optimized_op_handler_ == nullptr) {
-    //      printf("Open optimize shared library failed.\n");
-    //    }
+    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
+    if (optimized_op_handler_ == nullptr) {
+      printf("Open optimize shared library failed.\n");
+    }
   }

   ~OptimizeModule() = default;
mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc

@@ -18,20 +18,19 @@
 #include <cstring>
 #include <cstdlib>

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num) {
   // original weight format : ohwi
-  // todo pack weight for arm32 platform
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_channel = conv_param->input_channel_;
   int out_channel = conv_param->output_channel_;
-  int oc8 = UP_DIV(out_channel, C8NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * ic4 * C4NUM * kernel_plane;
-  int unit_size = C8NUM * C4NUM;
-  int block_size = pack_weight_size / oc8;
+  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
+  int unit_size = oc_block * C4NUM;
+  int block_size = pack_weight_size / oc_block_num;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;

@@ -43,12 +42,12 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed
       int real_ic_num = ic_remainder < C4NUM ? ic_remainder : C4NUM;
       for (int h = 0; h < real_ic_num; h++) {
         int block_stride = channel_block_stride + h;
-        int packed_block_stride = packed_channel_block_size + h * C8NUM;
-        for (int j = 0; j < oc8; j++) {
-          int kernel_block_stride = block_stride + j * C8NUM * kernel_plane * in_channel;
+        int packed_block_stride = packed_channel_block_size + h * oc_block;
+        for (int j = 0; j < oc_block_num; j++) {
+          int kernel_block_stride = block_stride + j * oc_block * kernel_plane * in_channel;
           int packed_kernel_block_size = packed_block_stride + j * block_size;
-          int oc_remainder = out_channel - j * C8NUM;
-          int real_oc_num = oc_remainder < C8NUM ? oc_remainder : C8NUM;
+          int oc_remainder = out_channel - j * oc_block;
+          int real_oc_num = oc_remainder < oc_block ? oc_remainder : oc_block;
           for (int k = 0; k < real_oc_num; k++) {
             float *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
             float *packed_data_ptr = packed_weight + packed_kernel_block_size + k;
mindspore/lite/src/runtime/kernel/arm/opclib/pack.h

@@ -40,7 +40,8 @@ void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);
 void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight);
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num);

 void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum);
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc

@@ -326,18 +326,18 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
   }
 }

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
-                                int kernel_plane) {
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
+                                int kernel_plane, int oc_block) {
   int input_unit = 4;
-  int dst_step = iC4 * C4NUM * C8NUM;
+  int dst_step = iC4 * C4NUM * oc_block;
   for (int o = 0; o < output_channel; o++) {
-    int oc8_block_num = o / C8NUM;
-    int oc8_block_rem = o % C8NUM;
+    int oc_block_num = o / oc_block;
+    int oc_block_rem = o % oc_block;
     int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
-    int dst_oc_offset = oc8_block_num * C8NUM * iC4 * C4NUM * input_unit * input_unit + oc8_block_rem;
+    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
     for (int i = 0; i < iC4; i++) {
       float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
-      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * C8NUM * C4NUM;
+      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
 #ifdef ENABLE_ARM
       float32x4_t g00 = vld1q_f32(src_ic4_ptr);
       float32x4_t g01 = vld1q_f32(src_ic4_ptr + 4);

@@ -1368,4 +1368,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
   }
 }
 }
mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h

@@ -43,7 +43,8 @@ void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t
 void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data, int start_index,
                                int real_cal_num, int out_w_block, ConvParameter *conv_param);

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
-                                int kernel_plane);
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
+                                int kernel_plane, int oc_block);

 void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float *output_data, bool h_not_bound,
                            bool w_not_bound, int output_w);

@@ -67,4 +68,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
                                  int real_cal_num, int out_w_block, ConvParameter *conv_param);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_WINOGRAD_TRANSFORM_H_
mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc

@@ -122,7 +122,7 @@ TEST_F(TestPack, PackWeightFp32) {
   std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
   auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
   auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float)));
-  PackWeightFp32(weight_data, conv_param, packed_weight);
+  PackWeightFp32(weight_data, conv_param, packed_weight, C8NUM, oc8);
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {