提交 16da2f61 编写于 作者: M Megvii Engine Team

feat(dnn): adjust the conv1x1 algorithm to support fp16 nchw88

GitOrigin-RevId: a79a3919cbe57cd07b7f9f443fab18092152ba64
上级 5c13146d
......@@ -186,14 +186,25 @@ bool ConvBiasImpl::AlgoConv1x1::usable(
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
if (format != param::ConvBias::Format::NCHW &&
format != param::ConvBias::Format::NCHW44 &&
format != param::ConvBias::Format::NCHW44_DOT) {
format != param::ConvBias::Format::NCHW44_DOT &&
format != param::ConvBias::Format::NCHW88) {
return false;
}
//! hybrid mode is not supported
if (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 ||
param.filter_meta.ocpg == 1) {
if (param.filter_meta.icpg < 4_z || param.filter_meta.ocpg == 1) {
return false;
}
}
if (format == param::ConvBias::Format::NCHW88) {
bool is_packmode_not_default =
(m_matmul_algo->packmode() !=
MatrixMulImpl::AlgoBase::PackMode::DEFAULT);
//! nchw88 hybrid mode and channel-wise convolution are not supported
bool is_hybrid_mode_or_channel_wise =
(param.filter_meta.icpg < 8_z || param.filter_meta.ocpg == 1);
if (is_packmode_not_default || is_hybrid_mode_or_channel_wise) {
return false;
}
}
......
......@@ -38,6 +38,8 @@ MatrixMulImpl::KernSizeParam get_matmul_kern_param(
format = param::MatrixMul::Format::MK4;
} else if (param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
format = param::MatrixMul::Format::MK4_DOT;
} else if (param.filter_meta.format == param::ConvBias::Format::NCHW88) {
format = param::MatrixMul::Format::MK8;
}
return {param.filter_type,
......
......@@ -695,7 +695,7 @@ size_t ConvBiasImpl::NCBKernParam::filter_offset(
(group % 8 == 0 && icpg == 1 && ocpg == 1 &&
pack_group_size > 1) ||
(group == 1 && ocpg % 8 == 0),
"The filter shepe is not right of nchw88");
"The filter shape is not right of nchw88");
group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
filter_meta.ocpg * filter_meta.spatial[0] *
filter_meta.spatial[1] * filter_type.size();
......@@ -717,7 +717,7 @@ size_t ConvBiasImpl::NCBKernParam::filter_offset(
(group % 4 == 0 && icpg == 1 && ocpg == 1 &&
pack_group_size > 1) ||
(group == 1 && ocpg % 4 == 0),
"The filter shepe is not right of nchw44");
"The filter shape is not right of nchw44");
group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
filter_meta.ocpg * filter_meta.spatial[0] *
filter_meta.spatial[1] * filter_type.size();
......
......@@ -85,25 +85,6 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_RECORD) {
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/*!
 * \brief Run every case in \p args through a ConvBias checker with fp16 tensors.
 *
 * Tensors 0, 1, 2 and 4 are set to Float16, tensors 0 and 1 draw their values
 * from a NormalRNG(1.f), and \p algo_name is handed to ConvBiasAlgoChecker as
 * the before-exec callback.
 *
 * \param args cases to execute (param plus src/filter/bias shapes)
 * \param handle handle the checker is created on
 * \param algo_name algorithm name passed to ConvBiasAlgoChecker
 * \param epsilon tolerance passed to Checker::set_epsilon
 */
void checker_conv_bias_fp16(
        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name,
        float epsilon) {
    using namespace conv_bias;

    Checker<ConvBias> fp16_checker(handle);
    fp16_checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    fp16_checker.set_epsilon(epsilon);

    //! index 3 is deliberately left at its default dtype, as in the chained form
    for (size_t tensor_idx : {0, 1, 2, 4}) {
        fp16_checker.set_dtype(tensor_idx, dtype::Float16());
    }

    //! one generator shared by src (0) and filter (1)
    NormalRNG normal_rng(1.f);
    fp16_checker.set_rng(0, &normal_rng);
    fp16_checker.set_rng(1, &normal_rng);

    for (size_t case_idx = 0; case_idx < args.size(); ++case_idx) {
        const auto& test_case = args[case_idx];
        fp16_checker.set_param(test_case.param);
        fp16_checker.execs(
                {test_case.src, test_case.filter, test_case.bias, {}, {}});
    }
}
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
NormalRNG rng(1);
......@@ -111,6 +92,16 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
conv_bias::get_conv_bias_args({2, 3, 5}, 2, false, false, false), handle(),
rng, "ARMV8F16STRD2", 0.04);
}
//! Checks the algorithm named "CONV1x1:AARCH64_F16_MK8_16X12X1" on the NCHW88
//! cases produced by get_nchw88_conv_bias_args, with a tolerance of 0.03.
TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_CONV1x1_MATMUL_FP16_NCHW88) {
    //! owned by value; the generator's result is a temporary either way
    std::vector<conv_bias::TestArg> nchw88_cases =
            conv_bias::get_nchw88_conv_bias_args(
                    {1}, QUAN_NLMODE, BR_AND_BIAS_BIASMODE, 1, 0);
    NormalRNG normal_rng(1);
    checker_conv_bias_f16(
            nchw88_cases, handle(), normal_rng, "CONV1x1:AARCH64_F16_MK8_16X12X1",
            0.03);
}
#endif
#if MEGDNN_WITH_BENCHMARK
......@@ -213,6 +204,47 @@ void benchmarker_conv_bias(
}
}
//! Benchmarks the fp16 NCHW88 algorithm "CONV1x1:AARCH64_F16_MK8_16X12X1"
//! against "F16_CONV_NCHW88_DIRECT" on the same list of 1x1 cases.
TEST_F(AARCH64, BENCHMARK_CONVBIAS_CONV1x1_MATMUL_VS_DIRECT_NCHW88) {
    constexpr size_t RUNS = 50;
    using NLMode = param::ConvBias::NonlineMode;

    std::vector<conv_bias::TestArg> args_nchw88;

    //! Appends one NCHW88 case: pad 0, stride 1, identity nonlinearity.
    auto add_case = [&args_nchw88](
                            size_t N, size_t IC, size_t OC, size_t H, size_t W,
                            size_t FS, size_t group) {
        constexpr size_t pad = 0;
        constexpr size_t stride = 1;

        param::ConvBias conv_param;
        conv_param.format = param::ConvBias::Format::NCHW88;
        conv_param.nonlineMode = NLMode::IDENTITY;
        conv_param.pad_h = pad;
        conv_param.pad_w = pad;
        conv_param.stride_h = stride;
        conv_param.stride_w = stride;

        //! src, filter and bias shapes, channels packed by 8
        args_nchw88.emplace_back(
                conv_param, TensorShape{N, IC / 8, H, W, 8},
                TensorShape{OC / 8, IC / group / 8, FS, FS, 8, 8},
                TensorShape{1, OC / 8, 1, 1, 8});
    };

    std::vector<DType> data_type_fp16(4, dtype::Float16());

    //! {IC, OC, H (== W)} per benchmarked case; N, FS and group are all 1
    constexpr size_t layers[][3] = {
            {32, 64, 112}, {64, 128, 56}, {128, 256, 28}, {256, 512, 14}};
    for (const auto& layer : layers) {
        add_case(1, layer[0], layer[1], layer[2], layer[2], 1, 1);
    }

    std::string matmul_algo = "CONV1x1:AARCH64_F16_MK8_16X12X1";
    std::string direct_algo = "F16_CONV_NCHW88_DIRECT";
    benchmark_with_contrast(
            args_nchw88, matmul_algo, data_type_fp16, args_nchw88, direct_algo,
            data_type_fp16, RUNS, {1, {4}});
}
TEST_F(AARCH64, BENCHMARK_CONVBIAS_STRIDE2_FP32_FP16) {
benchmarker_conv_bias(
get_conv_bias_benchmaker_args({2, 3, 5, 7}, 2), handle(), "ARMV8F32STRD2",
......
......@@ -69,86 +69,6 @@ void benchmark_impl(
}
}
/*!
 * \brief Benchmark two ConvBias algorithms case by case and print the
 * throughput of each together with their relative speed-up.
 *
 * Both benchmarkers run on one CPU handle created from
 * \p single_thread_config. Case i of \p args is compared against case i of
 * \p args_contrast.
 *
 * \param args cases run with \p algo_name
 * \param algo_name algorithm name passed to ConvBiasAlgoChecker for \p args
 * \param data_type dtypes for \p args: [0], [1], [2] are applied to tensors
 *        0, 1, 2 and [3] to tensor 4
 * \param args_contrast cases run with \p algo_name_contrast; asserted to have
 *        the same size as \p args
 * \param algo_name_contrast algorithm name for the contrast side
 * \param data_type_contrast dtypes for \p args_contrast, same indexing as
 *        \p data_type
 * \param RUNS number of timed runs; each measured time is divided by it
 * \param single_thread_config executor config used to create the CPU handle
 */
void benchmark_with_contrast(
        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
        std::vector<DType>& data_type,
        const std::vector<conv_bias::TestArg>& args_contrast,
        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);

    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());

    benchmarker.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type[0])
            .set_dtype(1, data_type[1])
            .set_dtype(2, data_type[2])
            .set_dtype(4, data_type[3])
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
    benchmarker_contrast.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type_contrast[0])
            .set_dtype(1, data_type_contrast[1])
            .set_dtype(2, data_type_contrast[2])
            .set_dtype(4, data_type_contrast[3])
            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    algo_name_contrast.c_str()));

    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
    megdnn_assert(arg_size == arg_contrast_size);

    rep(i, arg_size) {
        TensorLayout dst_layout, dst_layout_contrast;
        auto opr = single_thread_handle.get()->create_operator<ConvBias>();

        //! primary side: deduce dst, then count 2 * dst_elems * filter[1..4],
        //! scaled by 1e3 / 2^30 (presumably exec() reports milliseconds, which
        //! would make computation / used a gflops figure -- confirm)
        auto&& arg = args[i];
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
                {arg.bias, data_type[2]}, {}, dst_layout);
        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
                            (1024 * 1024 * 1024) * 1e3;
        benchmarker.set_param(arg.param);
        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;

        //! contrast side: same measurement with its own args and dtypes
        auto&& arg_contrast = args_contrast[i];
        opr->param() = arg_contrast.param;
        opr->deduce_layout(
                {arg_contrast.src, data_type_contrast[0]},
                {arg_contrast.filter, data_type_contrast[1]},
                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
        float computation_contrast =
                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
                 arg_contrast.filter[2] * arg_contrast.filter[3] *
                 arg_contrast.filter[4] * 2.0) /
                (1024 * 1024 * 1024) * 1e3;
        benchmarker_contrast.set_param(arg_contrast.param);
        auto used_contrast = benchmarker_contrast.exec(
                                     {arg_contrast.src,
                                      arg_contrast.filter,
                                      arg_contrast.bias,
                                      {},
                                      {}}) /
                             RUNS;

        printf("Bench case: \n");
        //! an enum must be converted explicitly before it goes through the
        //! variadic printf under %u
        printf("padding: %u, stride: %u, nonline mode: %u\n", arg.param.pad_h,
               arg.param.stride_h, static_cast<unsigned>(arg.param.nonlineMode));
        printf("%s %s %s\n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
               arg_contrast.filter.to_string().c_str(),
               arg_contrast.bias.to_string().c_str());
        printf("%s: %f gflops;\n%s: %f gflops\n"
               "speed up = %f\n",
               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
               computation_contrast / used_contrast, used_contrast / used);
    }
}
} // namespace
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
......
......@@ -1116,6 +1116,93 @@ void benchmark_winograd_compare(
used_winograd2 / used_winograd1);
}
}
/*!
 * \brief Benchmark two ConvBias algorithms case by case and print the
 * throughput of each together with their relative speed-up.
 *
 * Both benchmarkers run on one CPU handle created from
 * \p single_thread_config. Case i of \p args is compared against case i of
 * \p args_contrast.
 *
 * \param args cases run with \p algo_name
 * \param algo_name algorithm name passed to ConvBiasAlgoChecker for \p args
 * \param data_type dtypes for \p args: [0], [1], [2] are applied to tensors
 *        0, 1, 2 and [3] to tensor 4
 * \param args_contrast cases run with \p algo_name_contrast; asserted to have
 *        the same size as \p args
 * \param algo_name_contrast algorithm name for the contrast side
 * \param data_type_contrast dtypes for \p args_contrast, same indexing as
 *        \p data_type
 * \param RUNS number of timed runs; each measured time is divided by it
 * \param single_thread_config executor config used to create the CPU handle
 */
void benchmark_with_contrast(
        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
        std::vector<DType>& data_type,
        const std::vector<conv_bias::TestArg>& args_contrast,
        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
    using NLMode = param::ConvBias::NonlineMode;
    //! read-only lookup table; modes not listed here are reported as "Unknown"
    const std::map<NLMode, std::string> nonlinemode2string{
            {NLMode::IDENTITY, "Identity"},
            {NLMode::RELU, "ReLU"},
            {NLMode::SIGMOID, "Sigmoid"},
            {NLMode::H_SWISH, "H_Swish"}};

    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);

    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());

    benchmarker.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type[0])
            .set_dtype(1, data_type[1])
            .set_dtype(2, data_type[2])
            .set_dtype(4, data_type[3])
            .set_before_exec_callback(
                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
    benchmarker_contrast.set_times(RUNS)
            .set_display(false)
            .set_dtype(0, data_type_contrast[0])
            .set_dtype(1, data_type_contrast[1])
            .set_dtype(2, data_type_contrast[2])
            .set_dtype(4, data_type_contrast[3])
            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
                    algo_name_contrast.c_str()));

    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
    megdnn_assert(arg_size == arg_contrast_size);

    rep(i, arg_size) {
        TensorLayout dst_layout, dst_layout_contrast;
        auto opr = single_thread_handle.get()->create_operator<ConvBias>();

        //! primary side: deduce dst, then count 2 * dst_elems * filter[1..4],
        //! scaled by 1e3 / 2^30 (presumably exec() reports milliseconds, which
        //! would make computation / used a gflops figure -- confirm)
        auto&& arg = args[i];
        opr->param() = arg.param;
        opr->deduce_layout(
                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
                {arg.bias, data_type[2]}, {}, dst_layout);
        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
                            (1024 * 1024 * 1024) * 1e3;
        benchmarker.set_param(arg.param);
        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;

        //! contrast side: same measurement with its own args and dtypes
        auto&& arg_contrast = args_contrast[i];
        opr->param() = arg_contrast.param;
        opr->deduce_layout(
                {arg_contrast.src, data_type_contrast[0]},
                {arg_contrast.filter, data_type_contrast[1]},
                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
        float computation_contrast =
                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
                 arg_contrast.filter[2] * arg_contrast.filter[3] *
                 arg_contrast.filter[4] * 2.0) /
                (1024 * 1024 * 1024) * 1e3;
        benchmarker_contrast.set_param(arg_contrast.param);
        auto used_contrast = benchmarker_contrast.exec(
                                     {arg_contrast.src,
                                      arg_contrast.filter,
                                      arg_contrast.bias,
                                      {},
                                      {}}) /
                             RUNS;

        //! find() keeps the table untouched, unlike operator[], which would
        //! insert an empty name for a mode that is not listed
        const auto nlmode_iter = nonlinemode2string.find(arg.param.nonlineMode);
        const char* nlmode_name = nlmode_iter == nonlinemode2string.end()
                                        ? "Unknown"
                                        : nlmode_iter->second.c_str();

        printf("Bench case: \n");
        printf("padding: %u, stride: %u, nonline mode: %s\n", arg.param.pad_h,
               arg.param.stride_h, nlmode_name);
        printf("%s %s %s\n", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
               arg_contrast.filter.to_string().c_str(),
               arg_contrast.bias.to_string().c_str());
        printf("%s: %f gflops;\n%s: %f gflops\n"
               "speed up = %f\n",
               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
               computation_contrast / used_contrast, used_contrast / used);
    }
}
#endif // MEGDNN_WITH_BENCHMARK
template <class Checker>
......
......@@ -72,6 +72,12 @@ void benchmark_winograd_weight_preprocess(
void benchmark_winograd_compare(
const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
/*!
 * \brief Benchmark two ConvBias algorithms on paired test cases and print the
 * throughput of each plus the ratio of their run times.
 *
 * \param args cases run with \p algo_name
 * \param algo_name algorithm name for \p args
 * \param data_type dtypes for \p args; entries [0], [1], [2] are used for
 *        tensors 0, 1, 2 and entry [3] for tensor 4
 * \param args_contrast cases run with \p algo_name_contrast; must contain as
 *        many cases as \p args
 * \param algo_name_contrast algorithm name for \p args_contrast
 * \param data_type_contrast dtypes for \p args_contrast, indexed like
 *        \p data_type
 * \param RUNS number of timed runs per case
 * \param single_thread_config executor config for the CPU handle the
 *        benchmark runs on
 */
void benchmark_with_contrast(
const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
std::vector<DType>& data_type,
const std::vector<conv_bias::TestArg>& args_contrast,
const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
size_t RUNS, TaskExecutorConfig&& single_thread_config);
#endif // MEGDNN_WITH_BENCHMARK
template <class Checker>
void check_winograd(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册