From 869a03271b463aa1501c36b2d7c42ec3f444e748 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 13 Jul 2021 13:13:03 +0800 Subject: [PATCH] perf(mgb): disable FoldingConvBiasDimshufflePass in cuda10 for performance GitOrigin-RevId: d1b95a6f01ba73f98c0094e00fee3e61e9139628 --- dnn/test/cuda/conv_bias_int8.cpp | 40 ++++++++++++++++++++++ src/gopt/impl/framework.cpp | 4 +++ src/gopt/impl/tensor_reformat.cpp | 7 ++-- src/gopt/include/megbrain/gopt/inference.h | 6 ++++ src/gopt/test/inference.cpp | 2 +- 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index df9da234e..b4f3ebaed 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) { param::ConvBias::Format::CHWN4); } +TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) { + CUBenchmarker benchmarker(handle_cuda()); + size_t RUNS = 1000; + benchmarker.set_display(false).set_times(RUNS); + + using namespace conv_bias; + UniformIntRNG int_rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW4_NCHW; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + + benchmarker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + + benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::Float32()) + .set_dtype(3, dtype::Float32()) + .set_dtype(4, dtype::Float32()) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_param(param); + + auto run = [&](const TensorShapeArray& shapes) { + auto time_in_ms = + benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) / + RUNS; + + printf("src=%s, filter=%s, dst=%s, time=%.2f\n", + shapes[0].to_string().c_str(), shapes[1].to_string().c_str(), + shapes[2].to_string().c_str(), time_in_ms); + }; + + 
run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}}); + run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}}); + run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}}); +} + #if CUDA_VERSION >= 10020 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) { diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp index 9a914afc9..ec4ef7a9d 100644 --- a/src/gopt/impl/framework.cpp +++ b/src/gopt/impl/framework.cpp @@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options( add_pass(); add_pass(FuseNCHW4Int8Preprocess::make()); add_pass(); +#if CUDA_VERSION >= 10020 add_pass(); +#endif }); cb(chwn4, { add_pass(); @@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options( add_pass(); add_pass(FuseNCHW4Int8Preprocess::make()); add_pass(); +#if CUDA_VERSION >= 10020 add_pass(); +#endif }); cb(fuse_conv_bias_nonlinearity, { add_pass(); }); diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index cfbf72ba5..c2fb74968 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const { MIDOUT_E } +#if CUDA_VERSION >= 10020 /* ==================== FoldingConvBiasDimshufflePass ================= */ const char* FoldingConvBiasDimshufflePass::name() const { return mgb_cstr_log("folding conv bias dimshuffle pass"); @@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const { return true; }; MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4); + MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32); auto on_opr = [&try_conv_dimshuffle_reshape_typecvt, &try_conv_reformat_nchw42nchw32, &try_conv_reformat_nchw42nhwc, -#if CUDA_VERSION >= 10020 &try_conv_reformat_nchw322nchw4, -#endif &rewriter](OperatorNodeBase* opr) { if (!try_conv_dimshuffle_reshape_typecvt(opr) && !try_conv_reformat_nchw42nchw32(opr) && 
!try_conv_reformat_nchw42nhwc(opr) -#if CUDA_VERSION >= 10020 && !try_conv_reformat_nchw322nchw4(opr) -#endif ) { rewriter.auto_replace_outputs(opr); } @@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const { MIDOUT_E } +#endif /* ==================== PaddingChannelPass ================= */ const char* PaddingChannelPass::name() const { diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h index d034fd8a9..76e831ea3 100644 --- a/src/gopt/include/megbrain/gopt/inference.h +++ b/src/gopt/include/megbrain/gopt/inference.h @@ -16,6 +16,10 @@ #include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h" +#if MGB_CUDA +#include <cuda.h> +#endif + namespace mgb { namespace gopt { @@ -427,11 +431,13 @@ namespace gopt { void apply(OptState& opt) const override; }; +#if CUDA_VERSION >= 10020 class FoldingConvBiasDimshufflePass final : public Pass { public: const char* name() const override; void apply(OptState& opt) const override; }; +#endif /*! * \brief padding channel to enable fast int8/int4 support diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index d988cb0f9..ad1d17206 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) { MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5); } +#if CUDA_VERSION >= 10020 TEST(TestGoptInference, FoldingConvDimshuffle) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); @@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) { MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse); } -#if CUDA_VERSION >= 10020 TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); -- GitLab