提交 869a0327 编写于 作者: M Megvii Engine Team

perf(mgb): disable FoldingConvBiasDimshufflePass for CUDA versions below 10.2 (CUDA_VERSION < 10020) for performance

GitOrigin-RevId: d1b95a6f01ba73f98c0094e00fee3e61e9139628
上级 0baf6b0d
......@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
param::ConvBias::Format::CHWN4);
}
// Benchmarks the quantized-int8 conv-bias kernel in NCHW4_NCHW format
// (int8 NCHW4 src/filter, float32 bias/output) on a few representative
// shapes, pinned to the INT8_NCHW4_DOTPROD_IMPLICIT_GEMM algorithm so the
// timing is not affected by heuristic algorithm selection.
// Operand order for execs is {src, filter, bias, z, dst}; shapes[2] is the
// per-channel bias ({1, C, 1, 1}).
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    // Fail the run if any algorithm other than the dot-product
    // implicit-GEMM kernel is picked.
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    // Runs one {src, filter, bias} triple and reports the average time per
    // execution in milliseconds.
    auto run = [&](const TensorShapeArray& shapes) {
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
                RUNS;
        // NOTE(review): shapes[2] is the bias operand, so label it as such
        // (the original printed it as "dst", which was misleading).
        printf("src=%s, filter=%s, bias=%s, time=%.2f\n",
               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
               shapes[2].to_string().c_str(), time_in_ms);
    };
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
......
......@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
add_pass<FoldingConvBiasDimshufflePass>();
#endif
});
cb(chwn4, {
add_pass<FuseConvBiasNonlinPass>();
......@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
add_pass<FoldingConvBiasDimshufflePass>();
#endif
});
cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
......
......@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
MIDOUT_E
}
#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const char* FoldingConvBiasDimshufflePass::name() const {
return mgb_cstr_log("folding conv bias dimshuffle pass");
......@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return true;
};
MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
&try_conv_reformat_nchw42nchw32,
&try_conv_reformat_nchw42nhwc,
#if CUDA_VERSION >= 10020
&try_conv_reformat_nchw322nchw4,
#endif
&rewriter](OperatorNodeBase* opr) {
if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
!try_conv_reformat_nchw42nchw32(opr) &&
!try_conv_reformat_nchw42nhwc(opr)
#if CUDA_VERSION >= 10020
&& !try_conv_reformat_nchw322nchw4(opr)
#endif
) {
rewriter.auto_replace_outputs(opr);
}
......@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
MIDOUT_E
}
#endif
/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
......
......@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#if MGB_CUDA
#include <cuda.h>
#endif
namespace mgb {
namespace gopt {
......@@ -427,11 +431,13 @@ namespace gopt {
void apply(OptState& opt) const override;
};
#if CUDA_VERSION >= 10020
/*!
 * \brief Graph pass that folds dimshuffle/reshape/typecvt operators adjacent
 * to a ConvBias into the conv itself by reformatting its tensor layout
 * (e.g. NCHW4 <-> NCHW32 / NHWC, judging by the helper names in apply()).
 *
 * Only declared for CUDA >= 10.2 (CUDA_VERSION encodes 10.2 as 10020); on
 * older toolkits the pass is compiled out entirely — presumably because the
 * folded kernels regress performance there (see the commit message).
 */
class FoldingConvBiasDimshufflePass final : public Pass {
public:
const char* name() const override;
void apply(OptState& opt) const override;
};
#endif
/*!
* \brief padding channel to enable fast int8/int4 support
......
......@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffle) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
......@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册