diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
index 54bbbd8c50389965af945bcab51d55b3a6a5831c..55994ff72e19df665a39f0012d85b0b90f83a436 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp
@@ -186,14 +186,25 @@ bool ConvBiasImpl::AlgoConv1x1::usable(
 #if MEGDNN_AARCH64 || MEGDNN_ARMV7
     if (format != param::ConvBias::Format::NCHW &&
         format != param::ConvBias::Format::NCHW44 &&
-        format != param::ConvBias::Format::NCHW44_DOT) {
+        format != param::ConvBias::Format::NCHW44_DOT &&
+        format != param::ConvBias::Format::NCHW88) {
         return false;
     }
     //! hybird mode is not support
     if (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
         param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
-        if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 ||
-            param.filter_meta.ocpg == 1) {
+        if (param.filter_meta.icpg < 4_z || param.filter_meta.ocpg == 1) {
+            return false;
+        }
+    }
+    if (format == param::ConvBias::Format::NCHW88) {
+        bool is_packmode_not_default =
+                (m_matmul_algo->packmode() !=
+                 MatrixMulImpl::AlgoBase::PackMode::DEFAULT);
+        //! nchw88 hybrid mode and channel wise are not supported
+        bool is_hybrid_mode_or_channel_wise =
+                (param.filter_meta.icpg < 8_z || param.filter_meta.ocpg == 1);
+        if (is_packmode_not_default || is_hybrid_mode_or_channel_wise) {
             return false;
         }
     }
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
index e909443063f2d33d9c5ba1183140f59617b92857..08d463baf6b1242ae922b8e22a5923d97a33a019 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
@@ -38,6 +38,8 @@ MatrixMulImpl::KernSizeParam get_matmul_kern_param(
         format = param::MatrixMul::Format::MK4;
     } else if (param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT) {
         format = param::MatrixMul::Format::MK4_DOT;
+    } else if (param.filter_meta.format == param::ConvBias::Format::NCHW88) {
+        format = param::MatrixMul::Format::MK8;
     }
 
     return {param.filter_type,
diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp
index 87b49b007bedfbca605ed977f035314d8dffe45f..399b9d24535dab6a52318782ca2bbf0db67e8280 100644
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -695,7 +695,7 @@ size_t ConvBiasImpl::NCBKernParam::filter_offset(
                         (group % 8 == 0 && icpg == 1 && ocpg == 1 &&
                          pack_group_size > 1) ||
                         (group == 1 && ocpg % 8 == 0),
-                "The filter shepe is not right of nchw88");
+                "The filter shape is not right for nchw88");
         group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
                        filter_meta.ocpg * filter_meta.spatial[0] *
                        filter_meta.spatial[1] * filter_type.size();
@@ -717,7 +717,7 @@
                         (group % 4 == 0 && icpg == 1 && ocpg == 1 &&
                          pack_group_size > 1) ||
                         (group == 1 && ocpg % 4 == 0),
-                "The filter shepe is not right of nchw44");
+                "The filter shape is not right for nchw44");
         group_offset = pack_group_size * group_pack_id * filter_meta.icpg *
                        filter_meta.ocpg * filter_meta.spatial[0] *
                        filter_meta.spatial[1] * filter_type.size();
diff --git a/dnn/test/aarch64/conv_bias.cpp b/dnn/test/aarch64/conv_bias.cpp
index 791fc91d049dd910533bddc5ade0016a923faf47..4b0750abb116f5f679f3af21fff45594ec3658be 100644
--- a/dnn/test/aarch64/conv_bias.cpp
+++ b/dnn/test/aarch64/conv_bias.cpp
@@ -85,25 +85,6 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_RECORD) {
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void checker_conv_bias_fp16(
-        std::vector<conv_bias::TestArg> args, Handle* handle, const char* algo_name,
-        float epsilon) {
-    using namespace conv_bias;
-    Checker<ConvBias> checker(handle);
-    checker.set_before_exec_callback(
-            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
-    checker.set_epsilon(epsilon);
-    checker.set_dtype(0, dtype::Float16())
-            .set_dtype(1, dtype::Float16())
-            .set_dtype(2, dtype::Float16())
-            .set_dtype(4, dtype::Float16());
-    NormalRNG rng(1.f);
-    checker.set_rng(0, &rng).set_rng(1, &rng);
-
-    for (auto&& arg : args) {
-        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}});
-    }
-}
 
 TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
     NormalRNG rng(1);
@@ -111,6 +92,16 @@ TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_DIRECT_FP16_STR2) {
             conv_bias::get_conv_bias_args({2, 3, 5}, 2, false, false, false), handle(),
             rng, "ARMV8F16STRD2", 0.04);
 }
+
+TEST_F(AARCH64_MULTI_THREADS, CONVBIAS_CONV1x1_MATMUL_FP16_NCHW88) {
+    std::vector<conv_bias::TestArg>&& args_nchw88 =
+            conv_bias::get_nchw88_conv_bias_args(
+                    {1}, QUAN_NLMODE, BR_AND_BIAS_BIASMODE, 1, 0);
+
+    NormalRNG rng(1);
+    checker_conv_bias_f16(
+            args_nchw88, handle(), rng, "CONV1x1:AARCH64_F16_MK8_16X12X1", 0.03);
+}
 #endif
 
 #if MEGDNN_WITH_BENCHMARK
@@ -213,6 +204,47 @@ void benchmarker_conv_bias(
     }
 }
 
+TEST_F(AARCH64, BENCHMARK_CONVBIAS_CONV1x1_MATMUL_VS_DIRECT_NCHW88) {
+    constexpr size_t RUNS = 50;
+    using NLMode = param::ConvBias::NonlineMode;
+
+    std::vector<conv_bias::TestArg> args_nchw88;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS,
+                          size_t group) {
+        param::ConvBias param_nchw88;
+        param_nchw88.format = param::ConvBias::Format::NCHW88;
+        for (size_t pad : {0}) {
+            for (size_t stride : {1}) {
+                for (auto nlmode : {NLMode::IDENTITY}) {
+                    param_nchw88.nonlineMode = nlmode;
+                    param_nchw88.pad_h = pad;
+                    param_nchw88.pad_w = pad;
+                    param_nchw88.stride_h = stride;
+                    param_nchw88.stride_w = stride;
+
+                    args_nchw88.emplace_back(
+                            param_nchw88, TensorShape{N, IC / 8, H, W, 8},
+                            TensorShape{OC / 8, IC / group / 8, FS, FS, 8, 8},
+                            TensorShape{1, OC / 8, 1, 1, 8});
+                }
+            }
+        }
+    };
+    std::vector<DType> data_type_fp16 = {
+            dtype::Float16(), dtype::Float16(), dtype::Float16(), dtype::Float16()};
+    bench_case(1, 32, 64, 112, 112, 1, 1);
+    bench_case(1, 64, 128, 56, 56, 1, 1);
+    bench_case(1, 128, 256, 28, 28, 1, 1);
+    bench_case(1, 256, 512, 14, 14, 1, 1);
+
+    std::string algo_name_nchw88 = "CONV1x1:AARCH64_F16_MK8_16X12X1";
+    std::string algo_name_nchw88_direct = "F16_CONV_NCHW88_DIRECT";
+
+    benchmark_with_contrast(
+            args_nchw88, algo_name_nchw88, data_type_fp16, args_nchw88,
+            algo_name_nchw88_direct, data_type_fp16, RUNS, {1, {4}});
+}
+
 TEST_F(AARCH64, BENCHMARK_CONVBIAS_STRIDE2_FP32_FP16) {
     benchmarker_conv_bias(
             get_conv_bias_benchmaker_args({2, 3, 5, 7}, 2), handle(), "ARMV8F32STRD2",
diff --git a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
index 48c03314466096ddd09920212664282c7b33cb3f..ab4c432f048e1883576112a7a3f9507399088142 100644
--- a/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread_benchmark.cpp
@@ -69,86 +69,6 @@ void benchmark_impl(
     }
 }
 
-void benchmark_with_contrast(
-        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
-        std::vector<DType>& data_type,
-        const std::vector<conv_bias::TestArg>& args_contrast,
-        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
-        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
-    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
-
-    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
-    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());
-
-    benchmarker.set_times(RUNS)
-            .set_display(false)
-            .set_dtype(0, data_type[0])
-            .set_dtype(1, data_type[1])
-            .set_dtype(2, data_type[2])
-            .set_dtype(4, data_type[3])
-            .set_before_exec_callback(
-                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
-    benchmarker_contrast.set_times(RUNS)
-            .set_display(false)
-            .set_dtype(0, data_type_contrast[0])
-            .set_dtype(1, data_type_contrast[1])
-            .set_dtype(2, data_type_contrast[2])
-            .set_dtype(4, data_type_contrast[3])
-            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
-                    algo_name_contrast.c_str()));
-
-    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
-    megdnn_assert(arg_size == arg_contrast_size);
-    rep(i, arg_size) {
-        TensorLayout dst_layout, dst_layout_contrast;
-        auto opr = single_thread_handle.get()->create_operator<ConvBias>();
-
-        auto&& arg = args[i];
-        opr->param() = arg.param;
-        opr->deduce_layout(
-                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
-                {arg.bias, data_type[2]}, {}, dst_layout);
-        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
-                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
-                            (1024 * 1024 * 1024) * 1e3;
-        benchmarker.set_param(arg.param);
-        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;
-
-        auto&& arg_contrast = args_contrast[i];
-        opr->param() = arg_contrast.param;
-        opr->deduce_layout(
-                {arg_contrast.src, data_type_contrast[0]},
-                {arg_contrast.filter, data_type_contrast[1]},
-                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
-        float computation_contrast =
-                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
-                 arg_contrast.filter[2] * arg_contrast.filter[3] *
-                 arg_contrast.filter[4] * 2.0) /
-                (1024 * 1024 * 1024) * 1e3;
-        benchmarker_contrast.set_param(arg_contrast.param);
-        auto used_contrast = benchmarker_contrast.exec(
-                                     {arg_contrast.src,
-                                      arg_contrast.filter,
-                                      arg_contrast.bias,
-                                      {},
-                                      {}}) /
-                             RUNS;
-
-        printf("Bench case: \n");
-        printf("padding: %u, stride: %u, nonline mode: %u\n", arg.param.pad_h,
-               arg.param.stride_h, arg.param.nonlineMode);
-        printf("%s %s %s\n", arg.src.to_string().c_str(),
-               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
-        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
-               arg_contrast.filter.to_string().c_str(),
-               arg_contrast.bias.to_string().c_str());
-
-        printf("%s: %f gflops;\n%s: %f gflops\n"
-               "spead up = %f\n",
-               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
-               computation_contrast / used_contrast, used_contrast / used);
-    }
-}
 }  // namespace
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index a09ebd44224c3a00cb3a40361441c94503472db7..3b865828caf1e33278a8b3cb05ec9f3ab0f0e7e3 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -1116,6 +1116,93 @@ void benchmark_winograd_compare(
                used_winograd2 / used_winograd1);
     }
 }
+
+void benchmark_with_contrast(
+        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
+        std::vector<DType>& data_type,
+        const std::vector<conv_bias::TestArg>& args_contrast,
+        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
+        size_t RUNS, TaskExecutorConfig&& single_thread_config) {
+    using NLMode = param::ConvBias::NonlineMode;
+    std::map<NLMode, std::string> nonlinemode2string{
+            {NLMode::IDENTITY, "Identity"},
+            {NLMode::RELU, "ReLU"},
+            {NLMode::SIGMOID, "Sigmoid"},
+            {NLMode::H_SWISH, "H_Swish"}};
+    auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config);
+
+    auto benchmarker = Benchmarker<ConvBias>(single_thread_handle.get());
+    auto benchmarker_contrast = Benchmarker<ConvBias>(single_thread_handle.get());
+
+    benchmarker.set_times(RUNS)
+            .set_display(false)
+            .set_dtype(0, data_type[0])
+            .set_dtype(1, data_type[1])
+            .set_dtype(2, data_type[2])
+            .set_dtype(4, data_type[3])
+            .set_before_exec_callback(
+                    conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name.c_str()));
+    benchmarker_contrast.set_times(RUNS)
+            .set_display(false)
+            .set_dtype(0, data_type_contrast[0])
+            .set_dtype(1, data_type_contrast[1])
+            .set_dtype(2, data_type_contrast[2])
+            .set_dtype(4, data_type_contrast[3])
+            .set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
+                    algo_name_contrast.c_str()));
+
+    size_t arg_size = args.size(), arg_contrast_size = args_contrast.size();
+    megdnn_assert(arg_size == arg_contrast_size);
+    rep(i, arg_size) {
+        TensorLayout dst_layout, dst_layout_contrast;
+        auto opr = single_thread_handle.get()->create_operator<ConvBias>();
+
+        auto&& arg = args[i];
+        opr->param() = arg.param;
+        opr->deduce_layout(
+                {arg.src, data_type[0]}, {arg.filter, data_type[1]},
+                {arg.bias, data_type[2]}, {}, dst_layout);
+        float computation = (dst_layout.total_nr_elems() * arg.filter[1] *
+                             arg.filter[2] * arg.filter[3] * arg.filter[4] * 2.0) /
+                            (1024 * 1024 * 1024) * 1e3;
+        benchmarker.set_param(arg.param);
+        auto used = benchmarker.exec({arg.src, arg.filter, arg.bias, {}, {}}) / RUNS;
+
+        auto&& arg_contrast = args_contrast[i];
+        opr->param() = arg_contrast.param;
+        opr->deduce_layout(
+                {arg_contrast.src, data_type_contrast[0]},
+                {arg_contrast.filter, data_type_contrast[1]},
+                {arg_contrast.bias, data_type_contrast[2]}, {}, dst_layout_contrast);
+        float computation_contrast =
+                (dst_layout_contrast.total_nr_elems() * arg_contrast.filter[1] *
+                 arg_contrast.filter[2] * arg_contrast.filter[3] *
+                 arg_contrast.filter[4] * 2.0) /
+                (1024 * 1024 * 1024) * 1e3;
+        benchmarker_contrast.set_param(arg_contrast.param);
+        auto used_contrast = benchmarker_contrast.exec(
+                                     {arg_contrast.src,
+                                      arg_contrast.filter,
+                                      arg_contrast.bias,
+                                      {},
+                                      {}}) /
+                             RUNS;
+
+        printf("Bench case: \n");
+        printf("padding: %u, stride: %u, nonline mode: %s\n", arg.param.pad_h,
+               arg.param.stride_h, nonlinemode2string[arg.param.nonlineMode].c_str());
+        printf("%s %s %s\n", arg.src.to_string().c_str(),
+               arg.filter.to_string().c_str(), arg.bias.to_string().c_str());
+        printf("%s %s %s\n", arg_contrast.src.to_string().c_str(),
+               arg_contrast.filter.to_string().c_str(),
+               arg_contrast.bias.to_string().c_str());
+
+        printf("%s: %f gflops;\n%s: %f gflops\n"
+               "speed up = %f\n",
+               algo_name.c_str(), computation / used, algo_name_contrast.c_str(),
+               computation_contrast / used_contrast, used_contrast / used);
+    }
+}
 #endif  // MEGDNN_WITH_BENCHMARK
 
 template <typename T>
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index b44d0293609767523ebbb506f2d3fcd4e466ea5d..82b4789e183031c512b890811ed2ebb59e2cac13 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -72,6 +72,12 @@ void benchmark_winograd_weight_preprocess(
 void benchmark_winograd_compare(
         const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
         size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
+void benchmark_with_contrast(
+        const std::vector<conv_bias::TestArg>& args, const std::string algo_name,
+        std::vector<DType>& data_type,
+        const std::vector<conv_bias::TestArg>& args_contrast,
+        const std::string algo_name_contrast, std::vector<DType>& data_type_contrast,
+        size_t RUNS, TaskExecutorConfig&& single_thread_config);
 #endif  // MEGDNN_WITH_BENCHMARK
 
 template <typename T>
 void check_winograd(