Commit 7c0db5e2

Authored on Aug 18, 2020 by mindspore-ci-bot; committed via Gitee on Aug 18, 2020.

!4648 optimize fp16 conv3x3 input transform

Merge pull request !4648 from fuzhiye/tmp

Parents: ff2851a2, c8e3ab2b
Showing 7 changed files with 450 additions and 808 deletions (+450 / -808):
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc          (+13, -14)
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc     (+12, -17)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c                (+9, -8)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c                (+22, -14)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h                (+2, -2)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c  (+232, -200)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c      (+160, -553)
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc

@@ -36,15 +36,15 @@ void ProcessFilterFp16(float16_t *origin_weight, float16_t *dst_weight, ConvPara
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
-  int iC4 = UP_DIV(input_channel, C4NUM);
+  int iC8 = UP_DIV(input_channel, C8NUM);
   int oC8 = UP_DIV(output_channel, C8NUM);
-  size_t tmp_size = oC8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float16_t);
+  size_t tmp_size = oC8 * C8NUM * iC8 * C8NUM * kernel_plane * sizeof(float16_t);
   auto tmp_addr = reinterpret_cast<float16_t *>(malloc(tmp_size));
   memset(tmp_addr, 0, tmp_size);
   PackWeightToC4Fp16(origin_weight, tmp_addr, conv_param);
-  Conv3x3Fp16FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
+  Conv3x3Fp16FilterTransform(tmp_addr, dst_weight, iC8 * 2, output_channel, kernel_plane);
   free(tmp_addr);
 }

@@ -52,10 +52,10 @@ void ProcessFilterFp16(float16_t *origin_weight, float16_t *dst_weight, ConvPara
 int Convolution3x3FP16CPUKernel::InitWeightBias() {
   auto input_channel = conv_param_->input_channel_;
   int output_channel = conv_param_->output_channel_;
-  int iC4 = UP_DIV(input_channel, C4NUM);
+  int iC8 = UP_DIV(input_channel, C8NUM);
   int oC8 = UP_DIV(output_channel, C8NUM);
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * 36 * sizeof(float16_t);
+  size_t transformed_size = iC8 * C8NUM * oC8 * C8NUM * 36 * sizeof(float16_t);
   transformed_filter_addr_ = reinterpret_cast<float16_t *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed_filter_addr_ failed.";

@@ -92,11 +92,11 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
 int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   const int tile_num = 16;
   const int k_plane = 36;
-  int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
+  int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   /*=============================tile_buffer_============================*/
-  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC4 * C4NUM * sizeof(float16_t);
+  size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
   tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc tile_buffer_ failed.";

@@ -105,7 +105,7 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   memset(tile_buffer_, 0, tile_buffer_size);
   /*=============================block_unit_buffer_============================*/
-  size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float16_t);
+  size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t);
   block_unit_buffer_ = reinterpret_cast<float16_t *>(malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
     MS_LOG(ERROR) << "malloc block_unit_buffer_ failed.";

@@ -133,14 +133,14 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   memset(tmp_out_, 0, tmp_out_size);
   /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc4_input_size);
+  size_t nhwc8_input_size =
+    iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
   if (nhwc4_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
     return RET_ERROR;
   }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }

@@ -189,7 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
     return RET_ERROR;
   }
-  ConfigInputOutput();
   return RET_OK;
 }

@@ -225,7 +224,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;
   int in_channel = conv_param_->input_channel_;
-  convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC8Fp16(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);
   int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
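
Two points stand out in this file. First, the temporary buffers are now sized in C8 units; as a worked example with assumed values thread_count_ = 2 and input_channel_ = 4 (not taken from the commit), the tile buffer goes from 2 * 16 * 36 * UP_DIV(4, 4) * 4 = 4608 fp16 elements to 2 * 16 * 36 * UP_DIV(4, 8) * 8 = 9216: narrow inputs pay more padding, but each Winograd block unit now occupies exactly one 8-wide fp16 vector. Second, Run() calls PackNHWCToNHWC8Fp16 directly instead of going through the convert_func_ pointer, which is presumably why the ConfigInputOutput() call could be dropped from ReSize() above.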
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -150,10 +150,11 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
 int ConvolutionWinogradFP16CPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int ic4 = UP_DIV(channel_in, BLOCK);
+  int ic8 = UP_DIV(channel_in, C8NUM);
+  int ic4 = ic8 * 2;
   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic8 * C8NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";

@@ -191,11 +192,11 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int channel_out = conv_param_->output_channel_;
   int output_h = conv_param_->output_h_;
   int output_w = conv_param_->output_w_;
-  int ic4 = UP_DIV(channel_in, C4NUM);
+  int ic8 = UP_DIV(channel_in, C8NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
   /*=============================trans_input_============================*/
-  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
+  size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
   trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc trans_input_ failed.";

@@ -223,12 +224,12 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   /*=============================tmp_data_============================*/
   tmp_data_ =
-    reinterpret_cast<float16_t *>(malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
+    reinterpret_cast<float16_t *>(malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
   if (tmp_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_data_ failed.";
     return RET_ERROR;
   }
-  memset(tmp_data_, 0, C4NUM * input_unit_ * input_unit_ * sizeof(float16_t));
+  memset(tmp_data_, 0, C8NUM * input_unit_ * input_unit_ * sizeof(float16_t));
   tmp_buffer_address_list_[0] = trans_input_;
   tmp_buffer_address_list_[1] = gemm_out_;

@@ -236,24 +237,18 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   tmp_buffer_address_list_[3] = tmp_data_;
   /*=============================nhwc4_input_============================*/
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  nhwc4_input_ = malloc(nhwc4_input_size);
+  size_t nhwc8_input_size =
+    ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
+  nhwc4_input_ = malloc(nhwc8_input_size);
   if (nhwc4_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
     return RET_ERROR;
   }
-  memset(nhwc4_input_, 0, nhwc4_input_size);
+  memset(nhwc4_input_, 0, nhwc8_input_size);
   return RET_OK;
 }

 int ConvolutionWinogradFP16CPUKernel::ConfigInputOutput() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto ret = CheckLayout(input_tensor);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Check layout failed.";
     return RET_ERROR;
   }
   auto output_tensor = out_tensors_.at(kOutputIndex);
   output_tensor->SetFormat(schema::Format_NHWC);

@@ -348,7 +343,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
   int in_h = conv_param_->input_h_;
   int in_w = conv_param_->input_w_;
   int in_channel = conv_param_->input_channel_;
-  convert_func_(execute_input_, nhwc4_input_, in_batch, in_h * in_w, in_channel);
+  PackNHWCToNHWC8Fp16(execute_input_, nhwc4_input_, in_batch, in_h * in_w, in_channel);
   int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/conv_fp16.c

@@ -382,7 +382,8 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16
   const int tile_num = 16;
   const int output_unit = 4;
   const int k_plane = 36;
-  int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
+  int ic8 = UP_DIV(conv_param->input_channel_, C8NUM);
+  int ic4 = ic8 * 2;
   int oc8 = UP_DIV(conv_param->output_channel_, C8NUM);
   int out_w_block = UP_DIV(conv_param->output_w_, C4NUM);

@@ -390,7 +391,7 @@ void Conv3x3Fp16(float16_t *input_data, float16_t *transed_weight, const float16
   int output_count = out_w_block * out_h_block;
   int output_tile_count = UP_DIV(output_count, tile_num);
   int tile_buffer_offset = tile_num * k_plane * ic4 * C4NUM;
-  int block_unit_buffer_offset = k_plane * C4NUM;
+  int block_unit_buffer_offset = k_plane * C8NUM;
   int tmp_dst_buffer_offset = tile_num * k_plane * oc8 * C8NUM;
   int input_batch = conv_param->input_batch_;

@@ -541,7 +542,7 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
   int input_unit = conv_param->input_unit_;
   int in_batch = conv_param->input_batch_;
   int in_channel = conv_param->input_channel_;
-  int ic4 = UP_DIV(in_channel, C4NUM);
+  int ic8 = UP_DIV(in_channel, C8NUM);
   int out_unit = conv_param->output_unit_;
   int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
   int out_h_block = UP_DIV(conv_param->output_h_, out_unit);

@@ -557,16 +558,16 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
   float16_t *gemm_out = buffer_list[1];
   float16_t *tmp_out_data = buffer_list[2];
   float16_t *tmp_data = buffer_list[3];
-  int trans_input_offset = tile_num * input_unit_square * ic4 * C4NUM;
+  int trans_input_offset = tile_num * input_unit_square * ic8 * C8NUM;
   int gemm_out_offset = tile_num * input_unit_square * oc8 * C8NUM;
-  int tmp_data_offset = input_unit_square * C4NUM;
+  int tmp_data_offset = input_unit_square * C8NUM;
   // step 1 : filter transform (pre-processed offline)
   // step 2 : input transform (online)
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = b * ic8 * C8NUM * conv_param->input_h_ * conv_param->input_w_;
     int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc8 * C8NUM;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
-      int out_tile_index = thread_id * TILE_NUM;
+      int out_tile_index = thread_id * tile_num;
       int cal_num = output_count - thread_id * tile_num;
       cal_num = cal_num > tile_num ? tile_num : cal_num;
       WinogradInputTransformFp16(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,

@@ -574,7 +575,7 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
                                  input_trans_func);
       // step 3 : gemm
       IndirectGemmFp16_16x8(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset,
-                            trans_weight, NULL, input_unit_square, ic4, oc8 * C8NUM, output_offset, 1, 1, 0, 0);
+                            trans_weight, NULL, input_unit_square, ic8 * 2, oc8 * C8NUM, output_offset, 1, 1, 0, 0);
       // step 4 : output transform
       WinogradOutputTransformFp16(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
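
In step 3 the GEMM depth argument changes from ic4 to ic8 * 2: the depth is still counted in 4-channel units, but now spans the C8-padded input. A small self-check sketch (same assumed macro definitions as in the earlier example):

#include <assert.h>

#define C4NUM 4
#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main(void) {
  for (int ch = 1; ch <= 64; ++ch) {
    int ic4 = UP_DIV(ch, C4NUM); /* depth the old code passed */
    int ic8 = UP_DIV(ch, C8NUM);
    assert(ic8 * 2 >= ic4);                  /* new depth never undershoots the old one */
    assert(ic8 * 2 * C4NUM == ic8 * C8NUM);  /* and exactly covers the C8-padded channels */
  }
  return 0;
}

When ic8 * 2 exceeds ic4, the surplus 4-channel blocks lie in the zero-initialized padding of the transform buffers, so they add nothing to the accumulation.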
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c

@@ -161,7 +161,8 @@ void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_w
 void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data, ConvParameter *conv_param) {
   // origin weight format : ohwi
   int input_channel = conv_param->input_channel_;
-  int ic4 = UP_DIV(input_channel, C4NUM);
+  int ic8 = UP_DIV(input_channel, C8NUM);
+  int ic4 = ic8 * 2;
   int output_channel = conv_param->output_channel_;
   int kernel_plane = conv_param->kernel_h_ * conv_param->kernel_w_;

@@ -240,6 +241,26 @@ void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int c
   }
 }

+void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) {
+  int ic8 = UP_DIV(channel, C8NUM);
+  int nhwc8_batch_unit_offset = ic8 * C8NUM * plane;
+  int ic_remainder_ = channel % C8NUM;
+  if (ic_remainder_ != 0) {
+    int nhwc8_batch_offset = 0;
+    for (int b = 0; b < batch; b++) {
+      int batch_offset = b * channel * plane;
+      for (int i = 0; i < plane; i++) {
+        memcpy((float16_t *)dst + nhwc8_batch_offset + i * ic8 * C8NUM, (float16_t *)src + batch_offset + i * channel,
+               channel * sizeof(float16_t));
+      }
+      nhwc8_batch_offset += nhwc8_batch_unit_offset;
+    }
+  } else {
+    size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
+    memcpy(dst, src, ori_input_size);
+  }
+}
+
 void PackNHWC4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
   int c4 = UP_DIV(channel, C4NUM);
   int ic_remainder_ = channel % C4NUM;

@@ -399,19 +420,6 @@ void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, i
   }
 }

-void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
-  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
-  for (int b = 0; b < batch; b++) {
-    float16_t *dst_batch = dst + b * plane * c8_channel;
-    float16_t *src_batch = src + b * plane * channel;
-    for (int i = 0; i < plane; i++) {
-      float16_t *dst_plane = dst_batch + i * c8_channel;
-      float16_t *src_plane = src_batch + i * channel;
-      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
-    }
-  }
-}
-
 void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
   int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
   for (int b = 0; b < batch; b++) {
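
For reference, a toy driver for the new packing routine. This is a sketch, not part of the commit: the function body is copied from the diff above, while the float16_t typedef, the macros, and the tensor values are assumptions made for the demo (a real build would include pack_fp16.h and nnacl's own definitions instead):

#include <stdio.h>
#include <string.h>

typedef _Float16 float16_t; /* assumption: a compiler with _Float16 support */
#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Copy of the PackNHWCToNHWC8Fp16 added in pack_fp16.c above. */
void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int ic8 = UP_DIV(channel, C8NUM);
  int nhwc8_batch_unit_offset = ic8 * C8NUM * plane;
  int ic_remainder_ = channel % C8NUM;
  if (ic_remainder_ != 0) {
    int nhwc8_batch_offset = 0;
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * channel * plane;
      for (int i = 0; i < plane; i++) {
        memcpy((float16_t *)dst + nhwc8_batch_offset + i * ic8 * C8NUM,
               (float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
      }
      nhwc8_batch_offset += nhwc8_batch_unit_offset;
    }
  } else {
    size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
    memcpy(dst, src, ori_input_size);
  }
}

int main(void) {
  enum { BATCH = 1, PLANE = 2, CHANNEL = 3 }; /* tiny made-up tensor */
  float16_t src[BATCH * PLANE * CHANNEL];
  float16_t dst[BATCH * PLANE * UP_DIV(CHANNEL, C8NUM) * C8NUM];
  for (int i = 0; i < BATCH * PLANE * CHANNEL; ++i) src[i] = (float16_t)(i + 1);
  memset(dst, 0, sizeof(dst)); /* the kernels memset the NHWC8 buffer too */
  PackNHWCToNHWC8Fp16(src, dst, BATCH, PLANE, CHANNEL);
  /* Each pixel's 3 channels land at the start of an 8-slot group; slots
     3..7 keep their zero padding: 1 2 3 0 0 0 0 0 4 5 6 0 0 0 0 0 */
  for (size_t i = 0; i < sizeof(dst) / sizeof(dst[0]); ++i) printf("%g ", (double)dst[i]);
  printf("\n");
  return 0;
}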
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.h

@@ -43,6 +43,8 @@ void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int
 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);

+void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel);
+
 void PackNHWC4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel);

 void PackNCHWToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel);

@@ -63,8 +65,6 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane,
 void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);

-void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);
-
 void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);

 #ifdef __cplusplus
 }
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_transform_fp16.c

(This diff is collapsed on the source page and is not shown; +232, -200.)
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/winograd_utils_fp16.c

(This diff is collapsed on the source page and is not shown; +160, -553.)