Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
16da2f61
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
16da2f61
编写于
12月 16, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn): adjust the conv1x1 algorithm to support fp16 nchw88
GitOrigin-RevId: a79a3919cbe57cd07b7f9f443fab18092152ba64
上级
5c13146d
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
162 addition
and
104 deletion
+162
-104
dnn/src/fallback/conv_bias/conv1x1/algos.cpp
dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+14
-3
dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
+2
-0
dnn/src/fallback/conv_bias/opr_impl.cpp
dnn/src/fallback/conv_bias/opr_impl.cpp
+2
-2
dnn/test/aarch64/conv_bias.cpp
dnn/test/aarch64/conv_bias.cpp
+51
-19
dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
+0
-80
dnn/test/common/conv_bias.cpp
dnn/test/common/conv_bias.cpp
+87
-0
dnn/test/common/conv_bias.h
dnn/test/common/conv_bias.h
+6
-0
未找到文件。
dnn/src/fallback/conv_bias/conv1x1/algos.cpp
浏览文件 @
16da2f61
...
...
@@ -186,14 +186,25 @@ bool ConvBiasImpl::AlgoConv1x1::usable(
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
if
(
format
!=
param
::
ConvBias
::
Format
::
NCHW
&&
format
!=
param
::
ConvBias
::
Format
::
NCHW44
&&
format
!=
param
::
ConvBias
::
Format
::
NCHW44_DOT
)
{
format
!=
param
::
ConvBias
::
Format
::
NCHW44_DOT
&&
format
!=
param
::
ConvBias
::
Format
::
NCHW88
)
{
return
false
;
}
//! hybrid mode is not supported
if
(
param
.
filter_meta
.
format
==
param
::
ConvBias
::
Format
::
NCHW44
||
param
.
filter_meta
.
format
==
param
::
ConvBias
::
Format
::
NCHW44_DOT
)
{
if
(
param
.
filter_meta
.
icpg
<
4
_z
||
param
.
filter_meta
.
icpg
==
1
||
param
.
filter_meta
.
ocpg
==
1
)
{
if
(
param
.
filter_meta
.
icpg
<
4
_z
||
param
.
filter_meta
.
ocpg
==
1
)
{
return
false
;
}
}
if
(
format
==
param
::
ConvBias
::
Format
::
NCHW88
)
{
bool
is_packmode_not_default
=
(
m_matmul_algo
->
packmode
()
!=
MatrixMulImpl
::
AlgoBase
::
PackMode
::
DEFAULT
);
//! nchw88 hybrid mode and channel wise are not supported
bool
is_hybrid_mode_or_channel_wise
=
(
param
.
filter_meta
.
icpg
<
8
_z
||
param
.
filter_meta
.
ocpg
==
1
);
if
(
is_packmode_not_default
||
is_hybrid_mode_or_channel_wise
)
{
return
false
;
}
}
...
...
dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
浏览文件 @
16da2f61
...
...
@@ -38,6 +38,8 @@ MatrixMulImpl::KernSizeParam get_matmul_kern_param(
format
=
param
::
MatrixMul
::
Format
::
MK4
;
}
else
if
(
param
.
filter_meta
.
format
==
param
::
ConvBias
::
Format
::
NCHW44_DOT
)
{
format
=
param
::
MatrixMul
::
Format
::
MK4_DOT
;
}
else
if
(
param
.
filter_meta
.
format
==
param
::
ConvBias
::
Format
::
NCHW88
)
{
format
=
param
::
MatrixMul
::
Format
::
MK8
;
}
return
{
param
.
filter_type
,
...
...
dnn/src/fallback/conv_bias/opr_impl.cpp
浏览文件 @
16da2f61
...
...
@@ -695,7 +695,7 @@ size_t ConvBiasImpl::NCBKernParam::filter_offset(
(
group
%
8
==
0
&&
icpg
==
1
&&
ocpg
==
1
&&
pack_group_size
>
1
)
||
(
group
==
1
&&
ocpg
%
8
==
0
),
"The filter sh
e
pe is not right of nchw88"
);
"The filter sh
a
pe is not right of nchw88"
);
group_offset
=
pack_group_size
*
group_pack_id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
filter_meta
.
spatial
[
0
]
*
filter_meta
.
spatial
[
1
]
*
filter_type
.
size
();
...
...
@@ -717,7 +717,7 @@ size_t ConvBiasImpl::NCBKernParam::filter_offset(
(
group
%
4
==
0
&&
icpg
==
1
&&
ocpg
==
1
&&
pack_group_size
>
1
)
||
(
group
==
1
&&
ocpg
%
4
==
0
),
"The filter sh
e
pe is not right of nchw44"
);
"The filter sh
a
pe is not right of nchw44"
);
group_offset
=
pack_group_size
*
group_pack_id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
filter_meta
.
spatial
[
0
]
*
filter_meta
.
spatial
[
1
]
*
filter_type
.
size
();
...
...
dnn/test/aarch64/conv_bias.cpp
浏览文件 @
16da2f61
...
...
@@ -85,25 +85,6 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_RECORD) {
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! Run every case of \p args through a ConvBias Checker bound to \p handle.
//!
//! The checker is restricted to the algorithm called \p algo_name, uses
//! \p epsilon as its tolerance, sets tensor slots 0, 1, 2 and 4 to Float16 and
//! feeds slots 0 and 1 from a NormalRNG constructed with 1.f.
void checker_conv_bias_fp16(
        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name,
        float epsilon) {
    using namespace conv_bias;

    Checker<ConvBias> fp16_checker(handle);
    fp16_checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    fp16_checker.set_epsilon(epsilon);

    //! slot 3 is deliberately left untouched, exactly as in the chained form
    for (size_t slot : {0, 1, 2, 4}) {
        fp16_checker.set_dtype(slot, dtype::Float16());
    }

    NormalRNG normal_rng(1.f);
    fp16_checker.set_rng(0, &normal_rng);
    fp16_checker.set_rng(1, &normal_rng);

    for (size_t idx = 0; idx < args.size(); ++idx) {
        auto&& test_case = args[idx];
        fp16_checker.set_param(test_case.param);
        fp16_checker.execs(
                {test_case.src, test_case.filter, test_case.bias, {}, {}});
    }
}
TEST_F
(
AARCH64_MULTI_THREADS
,
CONVBIAS_DIRECT_FP16_STR2
)
{
NormalRNG
rng
(
1
);
...
...
@@ -111,6 +92,16 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
conv_bias
::
get_conv_bias_args
({
2
,
3
,
5
},
2
,
false
,
false
,
false
),
handle
(),
rng
,
"ARMV8F16STRD2"
,
0.04
);
}
//! Check the fp16 NCHW88 conv1x1 path against the reference through
//! checker_conv_bias_f16, pinned to the AARCH64_F16_MK8_16X12X1 matmul.
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_CONV1x1_MATMUL_FP16_NCHW88) {
    const char* conv1x1_algo = "CONV1x1:AARCH64_F16_MK8_16X12X1";

    //! kernel size list {1}: only 1x1 filters are generated
    std::vector<conv_bias::TestArg> nchw88_cases =
            conv_bias::get_nchw88_conv_bias_args(
                    {1}, QUAN_NLMODE, BR_AND_BIAS_BIASMODE, 1, 0);

    NormalRNG normal_rng(1);
    checker_conv_bias_f16(nchw88_cases, handle(), normal_rng, conv1x1_algo, 0.03);
}
#endif
#if MEGDNN_WITH_BENCHMARK
...
...
@@ -213,6 +204,47 @@ void benchmarker_conv_bias(
}
}
//! Benchmark the fp16 NCHW88 conv1x1-via-matmul algorithm against the direct
//! fp16 NCHW88 algorithm on four 1x1 convolution shapes, RUNS times each.
TEST_F(AARCH64, BENCHMARK_CONVBIAS_CONV1x1_MATMUL_VS_DIRECT_NCHW88) {
    using NLMode = param::ConvBias::NonlineMode;
    constexpr size_t RUNS = 50;

    std::vector<conv_bias::TestArg> cases;

    //! Append one NCHW88 TestArg per (pad, stride, nonlinearity) combination
    //! for the given logical convolution shape.
    auto add_case = [&cases](
                            size_t N, size_t IC, size_t OC, size_t H, size_t W,
                            size_t FS, size_t group) {
        param::ConvBias conv_param;
        conv_param.format = param::ConvBias::Format::NCHW88;
        for (size_t pad : {0}) {
            for (size_t stride : {1}) {
                for (auto nlmode : {NLMode::IDENTITY}) {
                    conv_param.nonlineMode = nlmode;
                    conv_param.pad_h = pad;
                    conv_param.pad_w = pad;
                    conv_param.stride_h = stride;
                    conv_param.stride_w = stride;
                    cases.emplace_back(
                            conv_param,
                            //! src, filter and bias shapes with channels packed by 8
                            TensorShape{N, IC / 8, H, W, 8},
                            TensorShape{OC / 8, IC / group / 8, FS, FS, 8, 8},
                            TensorShape{1, OC / 8, 1, 1, 8});
                }
            }
        }
    };

    std::vector<DType> fp16_dtypes = {
            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};

    //! columns: N, IC, OC, H, W, FS, group
    struct ConvShape {
        size_t n, ic, oc, h, w, fs, group;
    };
    const ConvShape shapes[] = {
            {1, 32, 64, 112, 112, 1, 1},
            {1, 64, 128, 56, 56, 1, 1},
            {1, 128, 256, 28, 28, 1, 1},
            {1, 256, 512, 14, 14, 1, 1},
    };
    for (const auto& s : shapes) {
        add_case(s.n, s.ic, s.oc, s.h, s.w, s.fs, s.group);
    }

    std::string conv1x1_algo = "CONV1x1:AARCH64_F16_MK8_16X12X1";
    std::string direct_algo = "F16_CONV_NCHW88_DIRECT";
    benchmark_with_contrast(
            cases, conv1x1_algo, fp16_dtypes, cases, direct_algo, fp16_dtypes, RUNS,
            {1, {4}});
}
TEST_F
(
AARCH64
,
BENCHMARK_CONVBIAS_STRIDE2_FP32_FP16
)
{
benchmarker_conv_bias
(
get_conv_bias_benchmaker_args
({
2
,
3
,
5
,
7
},
2
),
handle
(),
"ARMV8F32STRD2"
,
...
...
dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
浏览文件 @
16da2f61
...
...
@@ -69,86 +69,6 @@ void benchmark_impl(
}
}
//! Benchmark two ConvBias algorithms against each other, case by case, and
//! print the achieved throughput of each plus their time ratio.
//!
//! \param args                 cases run with \p algo_name
//! \param algo_name            algorithm the first benchmarker is pinned to
//! \param data_type            dtypes for the first run; entries 0..3 are
//!                             applied to tensor slots 0, 1, 2 and 4
//! \param args_contrast        cases run with \p algo_name_contrast; must have
//!                             the same size as \p args (asserted)
//! \param algo_name_contrast   algorithm the second benchmarker is pinned to
//! \param data_type_contrast   dtypes for the second run, same layout as
//!                             \p data_type
//! \param RUNS                 repetition count handed to set_times(); the
//!                             exec() result is divided by it
//! \param single_thread_config executor config used to create the CPU handle
void benchmark_with_contrast(
        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
        std::vector<DType>& data_type,
        const std::vector<conv_bias::TestArg>& args_contrast,
        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);

    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());

    benchmarker.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type[0])
            .set_dtype(1, data_type[1])
            .set_dtype(2, data_type[2])
            .set_dtype(4, data_type[3])
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
    benchmarker_contrast.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type_contrast[0])
            .set_dtype(1, data_type_contrast[1])
            .set_dtype(2, data_type_contrast[2])
            .set_dtype(4, data_type_contrast[3])
            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    algo_name_contrast.c_str()));

    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
    //! cases are compared pairwise by index, so both lists must line up
    megdnn_assert(arg_size == arg_contrast_size);

    rep(i, arg_size) {
        TensorLayout dst_layout, dst_layout_contrast;
        auto opr = single_thread_handle.get()->create_operator<ConvBias>();

        //! first algorithm: deduce the output layout to derive the work amount
        auto&& arg = args[i];
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
                {arg.bias, data_type[2]}, {}, dst_layout);
        //! 2 * output elements * filter dims 1..4, scaled by 1e3 / 2^30
        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
                            (1024 * 1024 * 1024) * 1e3;
        benchmarker.set_param(arg.param);
        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;

        //! second algorithm: same procedure on the contrast case
        auto&& arg_contrast = args_contrast[i];
        opr->param() = arg_contrast.param;
        opr->deduce_layout(
                {arg_contrast.src, data_type_contrast[0]},
                {arg_contrast.filter, data_type_contrast[1]},
                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
        float computation_contrast =
                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
                 arg_contrast.filter[2] * arg_contrast.filter[3] *
                 arg_contrast.filter[4] * 2.0) /
                (1024 * 1024 * 1024) * 1e3;
        benchmarker_contrast.set_param(arg_contrast.param);
        auto used_contrast = benchmarker_contrast.exec(
                                     {arg_contrast.src,
                                      arg_contrast.filter,
                                      arg_contrast.bias,
                                      {},
                                      {}}) /
                             RUNS;

        printf("Bench case: \n");
        //! a scoped enum is not promoted through varargs; cast it explicitly so
        //! the argument really matches the %u conversion
        printf("padding: %u, stride: %u, nonline mode: %u\n", arg.param.pad_h,
               arg.param.stride_h, static_cast<unsigned>(arg.param.nonlineMode));
        printf("%s %s %s\n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
               arg_contrast.filter.to_string().c_str(),
               arg_contrast.bias.to_string().c_str());
        //! ratio above 1 means the first algorithm is the faster one
        printf("%s: %f gflops;\n%s: %f gflops\n"
               "speed up = %f\n",
               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
               computation_contrast / used_contrast, used_contrast / used);
    }
}
}
// namespace
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
...
...
dnn/test/common/conv_bias.cpp
浏览文件 @
16da2f61
...
...
@@ -1116,6 +1116,93 @@ void benchmark_winograd_compare(
used_winograd2
/
used_winograd1
);
}
}
//! Benchmark two ConvBias algorithms against each other, case by case, and
//! print the achieved throughput of each plus their time ratio.
//!
//! \param args                 cases run with \p algo_name
//! \param algo_name            algorithm the first benchmarker is pinned to
//! \param data_type            dtypes for the first run; entries 0..3 are
//!                             applied to tensor slots 0, 1, 2 and 4
//! \param args_contrast        cases run with \p algo_name_contrast; must have
//!                             the same size as \p args (asserted)
//! \param algo_name_contrast   algorithm the second benchmarker is pinned to
//! \param data_type_contrast   dtypes for the second run, same layout as
//!                             \p data_type
//! \param RUNS                 repetition count handed to set_times(); the
//!                             exec() result is divided by it
//! \param single_thread_config executor config used to create the CPU handle
void benchmark_with_contrast(
        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
        std::vector<DType>& data_type,
        const std::vector<conv_bias::TestArg>& args_contrast,
        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
    using NLMode = param::ConvBias::NonlineMode;

    //! Printable name of a nonlinearity. A non-mutating lookup: modes without
    //! an entry are reported as "Unknown" instead of silently printing an
    //! empty string.
    auto nonline_mode_name = [](NLMode mode) -> const char* {
        switch (mode) {
            case NLMode::IDENTITY:
                return "Identity";
            case NLMode::RELU:
                return "ReLU";
            case NLMode::SIGMOID:
                return "Sigmoid";
            case NLMode::H_SWISH:
                return "H_Swish";
            default:
                return "Unknown";
        }
    };

    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);

    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());

    benchmarker.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type[0])
            .set_dtype(1, data_type[1])
            .set_dtype(2, data_type[2])
            .set_dtype(4, data_type[3])
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
    benchmarker_contrast.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type_contrast[0])
            .set_dtype(1, data_type_contrast[1])
            .set_dtype(2, data_type_contrast[2])
            .set_dtype(4, data_type_contrast[3])
            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    algo_name_contrast.c_str()));

    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
    //! cases are compared pairwise by index, so both lists must line up
    megdnn_assert(arg_size == arg_contrast_size);

    rep(i, arg_size) {
        TensorLayout dst_layout, dst_layout_contrast;
        auto opr = single_thread_handle.get()->create_operator<ConvBias>();

        //! first algorithm: deduce the output layout to derive the work amount
        auto&& arg = args[i];
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
                {arg.bias, data_type[2]}, {}, dst_layout);
        //! 2 * output elements * filter dims 1..4, scaled by 1e3 / 2^30
        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
                            (1024 * 1024 * 1024) * 1e3;
        benchmarker.set_param(arg.param);
        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;

        //! second algorithm: same procedure on the contrast case
        auto&& arg_contrast = args_contrast[i];
        opr->param() = arg_contrast.param;
        opr->deduce_layout(
                {arg_contrast.src, data_type_contrast[0]},
                {arg_contrast.filter, data_type_contrast[1]},
                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
        float computation_contrast =
                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
                 arg_contrast.filter[2] * arg_contrast.filter[3] *
                 arg_contrast.filter[4] * 2.0) /
                (1024 * 1024 * 1024) * 1e3;
        benchmarker_contrast.set_param(arg_contrast.param);
        auto used_contrast = benchmarker_contrast.exec(
                                     {arg_contrast.src,
                                      arg_contrast.filter,
                                      arg_contrast.bias,
                                      {},
                                      {}}) /
                             RUNS;

        printf("Bench case: \n");
        printf("padding: %u, stride: %u, nonline mode: %s\n", arg.param.pad_h,
               arg.param.stride_h, nonline_mode_name(arg.param.nonlineMode));
        printf("%s %s %s\n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
               arg_contrast.filter.to_string().c_str(),
               arg_contrast.bias.to_string().c_str());
        //! ratio above 1 means the first algorithm is the faster one
        printf("%s: %f gflops;\n%s: %f gflops\n"
               "speed up = %f\n",
               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
               computation_contrast / used_contrast, used_contrast / used);
    }
}
#endif // MEGDNN_WITH_BENCHMARK
template
<
class
Checker
>
...
...
dnn/test/common/conv_bias.h
浏览文件 @
16da2f61
...
...
@@ -72,6 +72,12 @@ void benchmark_winograd_weight_preprocess(
void
benchmark_winograd_compare
(
const
char
*
algoA_name
,
const
char
*
algoB_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
=
1
,
size_t
io_pack_size
=
1
);
//! Benchmark two ConvBias algorithms against each other and print, per case,
//! the throughput of each algorithm and the ratio of their run times.
//!
//! \p args is run with \p algo_name and \p args_contrast with
//! \p algo_name_contrast; both lists must have the same size because cases are
//! paired by index. Each dtype vector supplies four entries, applied to tensor
//! slots 0, 1, 2 and 4. \p RUNS is the repetition count per case and
//! \p single_thread_config is used to create the CPU handle that runs both.
void
benchmark_with_contrast
(
const
std
::
vector
<
conv_bias
::
TestArg
>&
args
,
const
std
::
string
algo_name
,
std
::
vector
<
DType
>&
data_type
,
const
std
::
vector
<
conv_bias
::
TestArg
>&
args_contrast
,
const
std
::
string
algo_name_contrast
,
std
::
vector
<
DType
>&
data_type_contrast
,
size_t
RUNS
,
TaskExecutorConfig
&&
single_thread_config
);
#endif // MEGDNN_WITH_BENCHMARK
template
<
class
Checker
>
void
check_winograd
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录