#include "test/common/conv_bias.h" #include "megdnn/opr_param_defs.h" #include "megdnn/oprs.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/rng.h" #include "test/common/task_record_check.h" #include "test/common/tensor.h" #include "test/fallback/fixture.h" #if MEGDNN_X86 #include "src/x86/utils.h" #endif namespace megdnn { namespace test { TEST_F(FALLBACK, CONV_BIAS_FORWARD) { using namespace conv_bias; std::vector args = get_args(); Checker checker(handle()); NormalRNG default_rng; UniformIntRNG int_rng{-50, 50}; param::ConvBias param; { param.format = param::ConvBias::Format::NHWC; auto src_shape = TensorShape{2, 16, 32, 24}; auto filter_shape = TensorShape{4, 3, 3, 24}; auto bias_shape_channel = TensorShape{1, 1, 1, 4}; checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_rng(0, &default_rng) .set_rng(1, &default_rng) .set_rng(2, &default_rng) .set_param(param) .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); } checker.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker("FALLBACK_NAIVE")); for (auto&& arg : args) { checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_rng(0, &default_rng) .set_rng(1, &default_rng) .set_rng(2, &default_rng) .set_epsilon(1e-3) .set_param(arg.param) .execs({arg.src, arg.filter, arg.bias, {}, {}}); } { param.format = param::ConvBias::Format::NCHW; param.sparse = ConvBias::Param::Sparse::GROUP; auto src_shape = TensorShape{2, 16, 32, 24}; auto filter_shape = TensorShape{4, 4, 4, 1, 1}; auto bias_shape_channel = TensorShape{1, 16, 1, 1}; auto bias_shape = TensorShape{2, 16, 32, 24}; checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_rng(0, &default_rng) .set_rng(1, &default_rng) .set_rng(2, &default_rng) .set_param(param) .execs({src_shape, filter_shape, bias_shape, {}, {}}) .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); } } TEST_F(FALLBACK, CONV_BIAS_FORWARD_RECORD) { using namespace conv_bias; TaskRecordChecker checker(1); NormalRNG default_rng; UniformIntRNG int_rng{-50, 50}; param::ConvBias param; { param.format = param::ConvBias::Format::NHWC; auto src_shape = TensorShape{2, 16, 32, 24}; auto filter_shape = TensorShape{4, 3, 3, 24}; auto bias_shape_channel = TensorShape{1, 1, 1, 4}; checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_rng(0, &default_rng) .set_rng(1, &default_rng) .set_rng(2, &default_rng) .set_param(param) .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); } { param.format = param::ConvBias::Format::NCHW; param.sparse = ConvBias::Param::Sparse::GROUP; auto src_shape = TensorShape{2, 16, 32, 24}; auto filter_shape = TensorShape{4, 4, 4, 1, 1}; auto bias_shape_channel = TensorShape{1, 16, 1, 1}; auto bias_shape = TensorShape{2, 16, 32, 24}; checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .set_dtype(2, dtype::Float32()) .set_rng(0, &default_rng) .set_rng(1, &default_rng) .set_rng(2, &default_rng) .set_param(param) .execs({src_shape, filter_shape, bias_shape, {}, {}}) .execs({src_shape, filter_shape, bias_shape_channel, {}, {}}); } } TEST_F(FALLBACK, FP32_GEMV_MK4_GI) { Checker checker(handle()); using Param = MatrixMul::Param; checker.set_before_exec_callback(AlgoChecker("FB_GI_F32_GEMV_MK4")); checker.set_epsilon(1e-2); auto run = [&](size_t M, size_t K) { Param param; param.format = param::MatrixMul::Format::MK4; param.transposeA = false; param.transposeB = false; TensorShape A, B; A = TensorShape{M / 4, K / 4, 4, 4}; B = TensorShape{K / 4, 1, 4}; checker.set_param(param).execs({A, B, {}}); }; // N = 1 for (size_t M : {4, 16, 128, 1024}) for (size_t K : {4, 8, 12, 128, 256, 4096}) run(M, K); } std::vector get_conv_bias_args( std::vector kernel, std::vector padv, std::vector nlmodev, std::vector stridev, bool no_bias, bool only_broadbias) { using namespace conv_bias; using Param = param::ConvBias; using NLMode = param::ConvBias::NonlineMode; std::vector args; auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad, size_t kernel, size_t stride, NLMode nonlinemode) { Param param; param.stride_h = stride; param.stride_w = stride; param.pad_h = pad; param.pad_w = pad; param.nonlineMode = nonlinemode; args.emplace_back( param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, TensorShape{}); if (!no_bias) { args.emplace_back( param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, TensorShape{1, oc, 1, 1}); if (!only_broadbias) { args.emplace_back( param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, kernel, kernel}, TensorShape{ n, oc, (h + 2 * param.pad_h - kernel) / stride + 1, (w + 2 * param.pad_h - kernel) / stride + 1}); } } }; auto pack_group = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, size_t pad, size_t kernel, size_t stride, NLMode nonlinemode) { Param param; param.stride_h = stride; param.stride_w = stride; param.pad_h = pad; param.pad_w = pad; param.nonlineMode = nonlinemode; param.sparse = param::ConvBias::Sparse::GROUP; args.emplace_back( param, TensorShape{n, 2 * ic, h, w}, TensorShape{2, oc, ic, kernel, kernel}, TensorShape{}); if (!no_bias) { args.emplace_back( param, TensorShape{n, 2 * ic, h, w}, TensorShape{2, oc, ic, kernel, kernel}, TensorShape{1, oc * 2, 1, 1}); if (!only_broadbias) { args.emplace_back( param, TensorShape{n, 2 * ic, h, w}, TensorShape{2, oc, ic, kernel, kernel}, TensorShape{ n, 2 * oc, (h + 2 * param.pad_h - kernel) / stride + 1, (w + 2 * param.pad_h - kernel) / stride + 1}); } } }; for (size_t n : {1, 2}) { for (auto nlmode : nlmodev) { for (auto pad : padv) { for (auto stride : stridev) { for (size_t ic : {1, 5}) { for (size_t oc : {1, 11}) { for (size_t size : {9, 30}) { for (size_t kern : kernel) { pack(n, oc, ic, size + 4, size + 4, pad, kern, stride, nlmode); pack_group( n, oc, ic, size, size, pad, kern, stride, nlmode); } } } } } } } } return args; } void checker_conv_bias( std::vector args, Handle* handle, RNG* rng, float epsilon, DType type0, DType type1, DType type2, DType type3, const char* algo_name) { using namespace conv_bias; Checker checker(handle); checker.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker(algo_name)); checker.set_dtype(0, type0); checker.set_dtype(1, type1); checker.set_dtype(2, type2); checker.set_dtype(4, type3); checker.set_epsilon(epsilon); if (NULL != rng) { checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng); } for (auto&& arg : args) { checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); } } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true); check_conv_bias(args, handle(), "CONV1x1:FB_GI_F32_MK4_PACK_4x12:24"); } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32_PREPROCESS) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_NO_BIASMODE, 1); #define cb(name) \ check_conv_bias_preprocess( \ args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ dtype::Float32(), dtype::Float32(), name); cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12"); #undef cb } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({3}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2); #define cb(name) \ check_conv_bias_preprocess( \ args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ dtype::Float32(), dtype::Float32(), name); cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12"); #undef cb } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_1X1_S1_MK4_PACK_F32_PREPROCESS) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({1}, FULL_NLMODE, ALL_BIASMODE, 1, true); #define cb(name) \ check_conv_bias_preprocess( \ args, handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ dtype::Float32(), dtype::Float32(), name); cb("CONV1x1:FB_GI_F32_MK4_PACK_4x12:24"); #undef cb } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S1_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({2, 4, 7}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 1); check_conv_bias(args, handle(), "IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12"); } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({3, 5, 6}, FULL_NLMODE, BR_AND_BIAS_BIASMODE, 2); #define cb(name) check_conv_bias(args, handle(), name); cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12"); #undef cb } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_GI_IM2COL_S2_MK4_PACK_F32_FUSE) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({3}, FULL_NLMODE, ALL_BIASMODE, 2); #define cb(name) check_conv_bias(args, handle(), name); cb("IM2COLMATMUL:FB_GI_F32_MK4_PACK_4x12"); #undef cb } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_IM2COL_8X8X16) { using namespace conv_bias; param::ConvBias cur_param; using NLMode = param::ConvBias::NonlineMode; std::vector args = get_conv_bias_args( {1, 3}, {0}, {NLMode::IDENTITY, NLMode::RELU}, {1}, false, true); NormalRNG default_rng; Checker checker(handle()); checker.set_dtype(0, dtype::Int8{}); checker.set_dtype(1, dtype::Int8{}); checker.set_dtype(2, dtype::Int16{}); checker.set_dtype(4, dtype::Int16{}); for (auto&& arg : args) { checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); } } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD) { using namespace conv_bias; param::ConvBias cur_param; using NLMode = param::ConvBias::NonlineMode; std::vector args = get_conv_bias_args( {1, 3, 5}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::SIGMOID, NLMode::RELU}, {1, 2}, false, false); NormalRNG default_rng; checker_conv_bias( args, handle(), &default_rng, 1e-3, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, "FALLBACK_NAIVE"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2) { check_conv_bias( conv_bias::get_nchw44_conv_bias_args( {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 2, false, true), handle(), "F32_CONV_NCHW_NCHW44"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) { check_conv_bias( conv_bias::get_nchw44_conv_bias_args( {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, ONLY_BR_BIASMODE, 1, false, true), handle(), "F32_CONV_NCHW_NCHW44"); } std::vector get_nchw44_channel_wise_args( std::vector kernel, size_t stride, bool no_bias, bool no_nonlinemode, bool no_full_bias) { using namespace conv_bias; using Param = param::ConvBias; using NLMode = param::ConvBias::NonlineMode; std::vector args; auto pack = [&](size_t n, size_t group, size_t w, size_t h, size_t kernel, size_t stride, NLMode nlmode, bool pad) { Param param; param.stride_h = stride; param.stride_w = stride; if (pad) { param.pad_h = kernel / 2; param.pad_w = kernel / 2; } else { param.pad_h = 0; param.pad_w = 0; } param.nonlineMode = nlmode; param.format = param::ConvBias::Format::NCHW44; param.sparse = param::ConvBias::Sparse::GROUP; args.emplace_back( param, TensorShape{n, group, h, w, 4}, TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{}); if (!no_bias) { args.emplace_back( param, TensorShape{n, group, h, w, 4}, TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{1, group, 1, 1, 4}); } if (!no_full_bias) { args.emplace_back( param, TensorShape{n, group, h, w, 4}, TensorShape{group, 1, 1, kernel, kernel, 4}, TensorShape{ n, group, (h + 2 * param.pad_w - kernel) / stride + 1, (w + 2 * param.pad_w - kernel) / stride + 1, 4}); } }; std::vector nonlinemode = {NLMode::IDENTITY}; if (!no_nonlinemode) { nonlinemode.emplace_back(NLMode::RELU); nonlinemode.emplace_back(NLMode::H_SWISH); } for (size_t n : {1, 2}) { for (auto nlmode : nonlinemode) { for (bool pad : {true}) { for (size_t group : {1, 2, 4, 7, 16}) { for (size_t size : {4, 6, 7, 9, 20}) { for (size_t kern : kernel) { pack(n, group, size, size, kern, stride, nlmode, pad); } } } } for (bool pad : {false}) { for (size_t group : {1, 2, 7, 16}) { for (size_t size : {7, 9, 20}) { for (size_t kern : kernel) { pack(n, group, size, size, kern, stride, nlmode, pad); } } } } } } return args; } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_1) { check_conv_bias( get_nchw44_channel_wise_args({2, 3}, 1, false, false, false), handle(), "F32_CHANNEL_WISE_NCHW44"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE1_FP32_NCHW44_2) { check_conv_bias( get_nchw44_channel_wise_args({5}, 1, false, false, false), handle(), "F32_CHANNEL_WISE_NCHW44"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_CHANNEL_WISE_STRIDE2_FP32_NCHW44) { check_conv_bias( get_nchw44_channel_wise_args({2, 3, 5}, 2, false, false, false), handle(), "F32_CHANNEL_WISE_NCHW44"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K7) { //! k=7 s=1 check_conv_bias( conv_bias::get_nchw44_conv_bias_args( {7}, ONLY_IDENTITY_NLMODE, BR_AND_NO_BIASMODE, 1), handle(), "F32_CONV_NCHW44_DIRECT"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K2K3) { check_conv_bias( conv_bias::get_nchw44_conv_bias_args( {2, 3}, FULL_NLMODE, ONLY_BR_BIASMODE, 1), handle(), "F32_CONV_NCHW44_DIRECT"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S1_K5) { check_conv_bias( conv_bias::get_nchw44_conv_bias_args({5}, FULL_NLMODE, ONLY_BR_BIASMODE, 1), handle(), "F32_CONV_NCHW44_DIRECT"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_NCHW44_S2) { check_conv_bias( conv_bias::get_nchw44_conv_bias_args( {2, 3, 5, 7}, FULL_NLMODE, ONLY_BR_BIASMODE, 2), handle(), "F32_CONV_NCHW44_DIRECT"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32) { check_conv_bias( conv_bias::get_conv_bias_args( {1, 2, 3, 4, 5, 6, 7}, 1, false, false, false), handle(), "F32DIRECT"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR2) { check_conv_bias( conv_bias::get_conv_bias_args({2, 3, 5, 7}, 2, false, false, false), handle(), "F32STRD2"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_DIRECT_FP32_STR1) { check_conv_bias( conv_bias::get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false), handle(), "F32STRD1"); } TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_PREPROCESS_NCHW44) { using namespace conv_bias; std::vector nchw44_args = conv_bias::get_nchw44_conv_bias_args( {3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1); Checker checker(handle()); auto run = [&checker]( const std::vector& args, DType A_dtype, DType B_dtype, DType C_dtype, DType D_dtype, const float eps) { for (auto&& arg : args) { checker.set_dtype(0, A_dtype) .set_dtype(1, B_dtype) .set_dtype(2, C_dtype) .set_dtype(4, D_dtype) .set_epsilon(eps) .set_param(arg.param) .execs({arg.src, arg.filter, arg.bias, {}, {}}); } }; //! uncomment this when low precision mode is ok // run(handle(), nchw44_args, {2, 6, 7}, dtype::Float32(), dtype::Float32(), // dtype::Float32(), dtype::Float32(), 1e-2f); //! remove this when low precision mode is ok run(nchw44_args, dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32(), 1e-3f); } TEST_F(FALLBACK_MULTI_THREADS, CONV_BIAS_FORWARD_QUANTIZED) { using namespace conv_bias; param::ConvBias cur_param; using NLMode = param::ConvBias::NonlineMode; std::vector args = get_conv_bias_args( {1, 3, 5, 7}, {0, 3}, {NLMode::IDENTITY, NLMode::H_SWISH, NLMode::RELU}, {1, 2}, false, false); UniformIntRNG int_rng{-50, 50}; float epsilon = 1e-3; checker_conv_bias( args, handle(), &int_rng, epsilon, dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), "FALLBACK_NAIVE"); } #if MEGDNN_WITH_BENCHMARK namespace { void benchmark_impl( const param::ConvBias param, std::vector, float>>& shapes_and_computation, const std::string algo_name, size_t RUNS, TaskExecutorConfig&& multi_thread_config, TaskExecutorConfig&& single_thread_config, std::vector& data_type) { std::vector multi_thread_times, single_thread_times; { auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config); auto benchmarker = Benchmarker(multi_thread_hanle.get()); benchmarker.set_times(RUNS) .set_display(false) .set_param(param) .set_dtype(0, data_type[0]) .set_dtype(1, data_type[1]) .set_dtype(2, data_type[2]) .set_dtype(4, data_type[3]) .set_before_exec_callback( conv_bias::ConvBiasAlgoChecker(algo_name.c_str())); for (auto shape : shapes_and_computation) { multi_thread_times.push_back(benchmarker.exec(shape.first) / RUNS); } } { auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config); auto benchmarker = Benchmarker(single_thread_handle.get()); benchmarker.set_times(RUNS) .set_display(false) .set_param(param) .set_dtype(0, data_type[0]) .set_dtype(1, data_type[1]) .set_dtype(2, data_type[2]) .set_dtype(4, data_type[3]) .set_before_exec_callback( conv_bias::ConvBiasAlgoChecker(algo_name.c_str())); for (auto shape : shapes_and_computation) { single_thread_times.push_back(benchmarker.exec(shape.first) / RUNS); } } printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread); printf("core_ids:"); for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) { printf("%zu ", multi_thread_config.affinity_core_set[i]); } printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]); for (size_t i = 0; i < shapes_and_computation.size(); i++) { auto shapes = shapes_and_computation[i]; printf("Bench case: "); for (auto&& shape : shapes.first) { printf("%s ", shape.to_string().c_str()); } float computations = shapes.second; printf("%zu threads gflops: %f,\n single thread gflops: " "%f. spead up = %f, speedup/cores=%f\n", multi_thread_config.nr_thread, computations / multi_thread_times[i], computations / single_thread_times[i], single_thread_times[i] / multi_thread_times[i], single_thread_times[i] / multi_thread_times[i] / multi_thread_config.nr_thread); } } } // namespace TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32) { constexpr size_t RUNS = 50; param::ConvBias param; param.nonlineMode = param::ConvBias::NonlineMode::RELU; param.pad_h = 1; param.pad_w = 1; param.stride_h = 1; param.stride_w = 1; param.sparse = param::ConvBias::Sparse::GROUP; std::vector, float>> shapes_and_computation; auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS, size_t group) { SmallVector shapes{ {N, IC, H, W}, {group, OC / group, IC / group, FS, FS}, {1, OC, 1, 1}, {}, {N, OC, H, W}}; TensorShape dst{N, OC, H, W}; float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6; shapes_and_computation.push_back(std::make_pair(shapes, computations)); }; bench_case(1, 32, 32, 200, 200, 3, 4); bench_case(1, 32, 32, 200, 200, 3, 32); bench_case(1, 32, 32, 128, 128, 3, 4); bench_case(1, 32, 32, 128, 128, 3, 32); bench_case(1, 32, 32, 100, 100, 3, 4); bench_case(1, 32, 32, 100, 100, 3, 32); bench_case(1, 32, 32, 80, 80, 3, 4); bench_case(1, 32, 32, 80, 80, 3, 32); std::string algo_name = "F32DIRECT"; printf("Benchmark F32DIRECT_LARGE_GROUP algo\n"); std::vector data_type = { dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()}; benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); shapes_and_computation.clear(); algo_name = "F32DIRECT"; printf("Benchmark F32DIRECT_SMALL_GROUP algo\n"); bench_case(1, 32, 32, 200, 200, 3, 1); bench_case(1, 32, 32, 128, 128, 3, 1); bench_case(1, 32, 32, 100, 100, 3, 1); bench_case(1, 32, 32, 80, 80, 3, 1); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); } TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR1) { constexpr size_t RUNS = 50; param::ConvBias param; param.nonlineMode = param::ConvBias::NonlineMode::RELU; param.pad_h = 1; param.pad_w = 1; param.stride_h = 1; param.stride_w = 1; param.sparse = param::ConvBias::Sparse::GROUP; std::vector, float>> shapes_and_computation; auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS, size_t group) { SmallVector shapes{ {N, IC, H, W}, {group, OC / group, IC / group, FS, FS}, {1, OC, 1, 1}, {}, {N, OC, H, W}}; TensorShape dst{N, OC, H, W}; float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6; shapes_and_computation.push_back(std::make_pair(shapes, computations)); }; bench_case(1, 32, 32, 200, 200, 3, 4); bench_case(1, 32, 32, 200, 200, 3, 32); bench_case(1, 32, 32, 128, 128, 3, 4); bench_case(1, 32, 32, 128, 128, 3, 32); bench_case(1, 32, 32, 100, 100, 3, 4); bench_case(1, 32, 32, 100, 100, 3, 32); bench_case(1, 32, 32, 80, 80, 3, 4); bench_case(1, 32, 32, 80, 80, 3, 32); std::string algo_name = "F32STRD1"; printf("Benchmark F32STRD1_LARGE_GROUP algo\n"); std::vector data_type = { dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()}; benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); shapes_and_computation.clear(); algo_name = "F32STRD1"; printf("Benchmark F32STRD1_SMALL_GROUP algo\n"); bench_case(1, 32, 32, 200, 200, 3, 1); bench_case(1, 32, 32, 128, 128, 3, 1); bench_case(1, 32, 32, 100, 100, 3, 1); bench_case(1, 32, 32, 80, 80, 3, 1); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); } TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_GI_CONVBIAS_DIRECTF32_STR2) { constexpr size_t RUNS = 50; param::ConvBias param; param.nonlineMode = param::ConvBias::NonlineMode::RELU; param.pad_h = 1; param.pad_w = 1; param.stride_h = 2; param.stride_w = 2; param.sparse = param::ConvBias::Sparse::GROUP; std::vector, float>> shapes_and_computation; auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS, size_t group, size_t P, size_t S) { SmallVector shapes{ {N, IC, H, W}, {group, OC / group, IC / group, FS, FS}, {1, OC, 1, 1}, {}, {N, OC, (H + 2 * P - FS) / S + 1, (W + 2 * P - FS) / S + 1}}; TensorShape dst{N, OC, H, W}; float computations = ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + dst.total_nr_elems()) * 1e-6; shapes_and_computation.push_back(std::make_pair(shapes, computations)); }; bench_case(1, 32, 32, 200, 200, 3, 4, 1, 2); bench_case(1, 32, 32, 200, 200, 3, 32, 1, 2); bench_case(1, 32, 32, 128, 128, 3, 4, 1, 2); bench_case(1, 32, 32, 128, 128, 3, 32, 1, 2); bench_case(1, 32, 32, 100, 100, 3, 4, 1, 2); bench_case(1, 32, 32, 100, 100, 3, 32, 1, 2); bench_case(1, 32, 32, 80, 80, 3, 4, 1, 2); bench_case(1, 32, 32, 80, 80, 3, 32, 1, 2); std::string algo_name = "F32STRD2"; printf("Benchmark F32STRD2_LARGE_GROUP algo\n"); std::vector data_type = { dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()}; benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); shapes_and_computation.clear(); algo_name = "F32STRD2"; printf("Benchmark F32STRD2_SMALL_GROUP algo\n"); bench_case(1, 32, 32, 200, 200, 3, 1, 1, 2); bench_case(1, 32, 32, 128, 128, 3, 1, 1, 2); bench_case(1, 32, 32, 100, 100, 3, 1, 1, 2); bench_case(1, 32, 32, 80, 80, 3, 1, 1, 2); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}}, {1, {7}}, data_type); benchmark_impl( param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}}, data_type); } TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE1_NCHW44) { // have to remove preferred restrict in usable func before run the benchmark using namespace conv_bias; param::ConvBias param; param.stride_h = 1; param.stride_w = 1; param.pad_h = 1; param.pad_w = 1; param.nonlineMode = NonlineMode::RELU; param.sparse = param::ConvBias::Sparse::GROUP; constexpr size_t RUN = 50; Benchmarker benchmark0(handle()); benchmark0.set_display(false); benchmark0.set_param(param); benchmark0.set_times(RUN); benchmark0.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker("F32STRD1")); auto opr = handle()->create_operator(); opr->param() = param; param.format = param::ConvBias::Format::NCHW44; Benchmarker benchmark1(handle()); benchmark1.set_display(false); benchmark1.set_param(param); benchmark1.set_times(RUN); benchmark1.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker("F32_CHANNEL_WISE_NCHW44")); auto run = [&](size_t group, size_t w, size_t h, size_t kernel) { TensorLayout dst_layout; opr->deduce_layout( {{1, group * 4, h, w}, dtype::Int8()}, {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()}, {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout); //! dst.nr_elems * IC * FH * FW * 2 float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 / (1024 * 1024 * 1024) * 1e3; auto used0 = benchmark0.exec( {{1, group * 4, h, w}, {group * 4, 1, 1, kernel, kernel}, {1, group * 4, 1, 1}, {}, {}}) / RUN; auto used1 = benchmark1.exec( {{1, group, h, w, 4}, {group, 1, 1, kernel, kernel, 4}, {1, group, 1, 1, 4}, {}, {}}) / RUN; printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops " "nchw44: " "%f ms %f GFlops " "speedup: %f\n", group, h, w, kernel, used0, computations / used0, used1, computations / used1, used0 / used1); }; for (size_t group : {8, 16, 32, 64}) { for (size_t kerenl : {2, 3, 5}) { run(group, 112, 112, kerenl); run(group, 56, 56, kerenl); run(group, 48, 48, kerenl); run(group, 28, 28, kerenl); run(group, 14, 14, kerenl); } } run(8, 112, 112, 3); run(32, 56, 56, 3); run(64, 28, 28, 3); run(128, 14, 14, 3); } TEST_F(FALLBACK, BENCHMARK_GI_CHANNEL_WISE_F32_STRIDE2_NCHW44) { // have to remove preferred restrict in usable func before run the benchmark using namespace conv_bias; param::ConvBias param; param.stride_h = 2; param.stride_w = 2; param.pad_h = 1; param.pad_w = 1; param.nonlineMode = NonlineMode::RELU; param.sparse = param::ConvBias::Sparse::GROUP; constexpr size_t RUN = 50; Benchmarker benchmark0(handle()); benchmark0.set_display(false); benchmark0.set_param(param); benchmark0.set_times(RUN); benchmark0.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker("F32STRD2")); auto opr = handle()->create_operator(); opr->param() = param; param.format = param::ConvBias::Format::NCHW44; Benchmarker benchmark1(handle()); benchmark1.set_display(false); benchmark1.set_param(param); benchmark1.set_times(RUN); benchmark1.set_before_exec_callback( conv_bias::ConvBiasAlgoChecker("F32_CHANNEL_WISE_NCHW44")); auto run = [&](size_t group, size_t w, size_t h, size_t kernel) { TensorLayout dst_layout; opr->deduce_layout( {{1, group * 4, h, w}, dtype::Int8()}, {{group * 4, 1, 1, kernel, kernel}, dtype::Int8()}, {{1, group * 4, 1, 1}, dtype::Int32()}, {}, dst_layout); //! dst.nr_elems * IC * FH * FW * 2 float computations = dst_layout.total_nr_elems() * kernel * kernel * 2.0 / (1024 * 1024 * 1024) * 1e3; auto used0 = benchmark0.exec( {{1, group * 4, h, w}, {group * 4, 1, 1, kernel, kernel}, {1, group * 4, 1, 1}, {}, {}}) / RUN; auto used1 = benchmark1.exec( {{1, group, h, w, 4}, {group, 1, 1, kernel, kernel, 4}, {1, group, 1, 1, 4}, {}, {}}) / RUN; printf("group/h/w/kernel:%zu,%zu,%zu,%zu: nchw: %f ms %f Gflops " "nchw44: " "%f ms %f GFlops " "speedup: %f\n", group, h, w, kernel, used0, computations / used0, used1, computations / used1, used0 / used1); }; for (size_t group : {8, 16, 32, 64}) { for (size_t kerenl : {2, 3, 5}) { run(group, 112, 112, kerenl); run(group, 56, 56, kerenl); run(group, 48, 48, kerenl); run(group, 28, 28, kerenl); run(group, 14, 14, kerenl); } } run(8, 112, 112, 3); run(32, 56, 56, 3); run(64, 28, 28, 3); run(128, 14, 14, 3); } TEST_F(FALLBACK, BENCHMARK_CONVBIAS) { constexpr size_t RUNS = 10; param::ConvBias param; param.stride_h = 1; param.stride_w = 1; Benchmarker benchmarker_int(handle()); benchmarker_int.set_times(RUNS) .set_dtype(0, dtype::QuantizedS8(2.5f)) .set_dtype(1, dtype::QuantizedS8(2.5f)) .set_dtype(2, dtype::QuantizedS32(6.25f)) .set_dtype(4, dtype::QuantizedS8(40.25f)) .set_display(false); Benchmarker benchmarker_float(handle()); benchmarker_float.set_display(false).set_times(RUNS); auto run = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, size_t FS) { TensorShape src({N, IC, H, W}), filter({OC, IC, FS, FS}), bias({N, OC, 1, 1}), z({}), dst({N, OC, H, W}); param.pad_h = FS / 2; param.pad_w = FS / 2; auto int_used = benchmarker_int.set_param(param).exec({src, filter, bias, z, dst}) / RUNS; auto float_used = benchmarker_float.set_param(param).exec({src, filter, bias, z, dst}) / RUNS; float computations = IC * (FS * FS + 1) * dst.total_nr_elems() * 2 * 1e-6; printf("run: %s %s %s->%s \nfloat: %f ms %f Gflops int: %f ms " "%f Gflops speedup: %f\n", src.to_string().c_str(), filter.to_string().c_str(), bias.to_string().c_str(), dst.to_string().c_str(), float_used, computations / float_used, int_used, computations / int_used, float_used / int_used); }; run(1, 128, 128, 32, 32, 3); for (size_t IC : {32, 64, 128}) { for (size_t OC : {32, 64, 128}) { for (size_t size : {28, 56}) { for (size_t FS : {3, 5}) { run(1, IC, OC, size, size, FS); } } } } } TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_4x4) { #if MEGDNN_AARCH64 conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4); #elif MEGDNN_ARMV7 conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:2", handle(), 3, 4); #else conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:2", handle(), 3, 4); #endif } void benchmark_winograd_nchw_vs_nchw44( const char* algo_name0, const char* algo_name1, Handle* handle) { using namespace conv_bias; using NLMode = param::ConvBias::NonlineMode; std::vector args_nchw44; std::vector args_nchw; auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, size_t group, NLMode nlmode) { param::ConvBias param; param.format = param::ConvBias::Format::NCHW44; param.stride_h = 1; param.stride_w = 1; param.pad_h = 1; param.pad_w = 1; param.nonlineMode = nlmode; if (group == 1) { param.sparse = param::ConvBias::Sparse::DENSE; args_nchw44.emplace_back( param, TensorShape{n, ic / 4, h, w, 4}, TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, TensorShape{}); param.format = param::ConvBias::Format::NCHW; args_nchw.emplace_back( param, TensorShape{n, ic, h, w}, TensorShape{oc, ic, 3, 3}, TensorShape{}); } else { auto oc_per_group = oc / group; auto ic_per_group = ic / group; param.sparse = param::ConvBias::Sparse::GROUP; args_nchw44.emplace_back( param, TensorShape{n, ic_per_group / 4, h, w, 4}, TensorShape{group, oc_per_group / 4, ic_per_group / 4, 3, 3, 4, 4}, TensorShape{}); param.format = param::ConvBias::Format::NCHW; args_nchw.emplace_back( param, TensorShape{n, ic, h, w}, TensorShape{group, oc_per_group, ic_per_group, 3, 3}, TensorShape{}); } }; std::vector nonlinemode = {NLMode::IDENTITY}; for (auto nlmode : nonlinemode) for (size_t n : {1}) for (size_t group = 1; group <= 1; ++group) { pack(n, 512, 512, 15, 15, group, nlmode); pack(n, 512, 256, 15, 15, group, nlmode); pack(n, 256, 256, 29, 29, group, nlmode); pack(n, 256, 128, 29, 29, group, nlmode); pack(n, 128, 128, 57, 57, group, nlmode); pack(n, 128, 64, 57, 57, group, nlmode); pack(n, 24, 24, 224, 224, group, nlmode); pack(n, 64, 24, 123, 123, group, nlmode); pack(n, 64, 64, 56, 56, group, nlmode); pack(n, 128, 128, 28, 28, group, nlmode); pack(n, 256, 256, 14, 14, group, nlmode); pack(n, 512, 512, 7, 7, group, nlmode); } using namespace conv_bias; constexpr size_t RUN = 10; Benchmarker benchmark_winograd_nchw(handle); benchmark_winograd_nchw.set_display(false); benchmark_winograd_nchw.set_times(RUN); Benchmarker benchmark_winograd_nchw44(handle); benchmark_winograd_nchw44.set_display(false); benchmark_winograd_nchw44.set_times(RUN); std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name0); std::string winograd_nchw44_algo_name = ssprintf("WINOGRAD_NCHW44:%s", algo_name1); for (size_t i = 0; i < args_nchw.size(); ++i) { auto arg_nchw = args_nchw[i]; auto arg_nchw44 = args_nchw44[i]; TensorLayout dst_layout; auto opr = handle->create_operator(); opr->param() = arg_nchw.param; opr->deduce_layout( {arg_nchw.src, dtype::Float32()}, {arg_nchw.filter, dtype::Float32()}, {arg_nchw.bias, dtype::Float32()}, {}, dst_layout); //! dst.nr_elems * IC * FH * FW * 2 float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] * arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3; benchmark_winograd_nchw.set_param(arg_nchw.param); auto nchw_used = algo_benchmark( benchmark_winograd_nchw, {arg_nchw.src, arg_nchw.filter, {}, {}, {}}, winograd_nchw_algo_name.c_str()) / RUN; benchmark_winograd_nchw44.set_param(arg_nchw44.param); auto nchw44_used = algo_benchmark( benchmark_winograd_nchw44, {arg_nchw44.src, arg_nchw44.filter, {}, {}, {}}, winograd_nchw44_algo_name.c_str()) / RUN; printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops " "speedup: " "%f\n", arg_nchw.src.to_string().c_str(), arg_nchw.filter.to_string().c_str(), nchw_used, computations / nchw_used, nchw44_used, computations / nchw44_used, nchw_used / nchw44_used); } } TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) { #if MEGDNN_AARCH64 benchmark_winograd_nchw_vs_nchw44( "AARCH64_F32_MK4_4x16:4:2", "AARCH64_F32_MK4_4x16:4:2", handle()); #elif MEGDNN_ARMV7 benchmark_winograd_nchw_vs_nchw44( "ARMV7_F32_MK4_4x8:4:2", "ARMV7_F32_MK4_4x8:4:2", handle()); #else benchmark_winograd_nchw_vs_nchw44( "FB_GI_F32_MK4_4x8:4:2", "FB_GI_F32_MK4_4x8:4:2", handle()); #endif } TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_4x4) { #if MEGDNN_AARCH64 conv_bias::benchmark_winograd("WINOGRAD:AARCH64_F32_MK4_4x16:4:6", handle(), 3, 4); #elif MEGDNN_ARMV7 conv_bias::benchmark_winograd("WINOGRAD:ARMV7_F32_MK4_4x8:4:6", handle(), 3, 4); #else conv_bias::benchmark_winograd("WINOGRAD:FB_GI_F32_MK4_4x8:4:6", handle(), 3, 4); #endif } TEST_F(FALLBACK, BENCHMARK_GI_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) { #if MEGDNN_AARCH64 benchmark_winograd_nchw_vs_nchw44( "AARCH64_F32_MK4_4x16:4:6", "AARCH64_F32_MK4_4x16:4:6", handle()); #elif MEGDNN_ARMV7 benchmark_winograd_nchw_vs_nchw44( "ARMV7_F32_MK4_4x8:4:6", "ARMV7_F32_MK4_4x8:4:6", handle()); #else benchmark_winograd_nchw_vs_nchw44( "FB_GI_F32_MK4_4x8:4:6", "FB_GI_F32_MK4_4x8:4:6", handle()); #endif } #endif } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen