Commit 142f31a8 authored by Megvii Engine Team

perf(dnn/cuda): change conv_bias heu, prefer dnn chanwise impl, dislike dnn batch gemm conv1x1

GitOrigin-RevId: 323bf6073a7b6e97ce9dfedc8caa49d351e5f96f
Parent f214e146
@@ -6,12 +6,13 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/conv_bias/opr_impl.h"
#include "megdnn/dtype.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"
@@ -124,6 +125,44 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
return nullptr;
};
const bool is_chanwise =
(args.filter_meta.format == Param::Format::NCHW &&
args.filter_meta.group == src[1]) ||
(args.filter_meta.format == Param::Format::NCHW4 &&
args.filter_meta.group == src[1] * 4) ||
(args.filter_meta.format == Param::Format::NCHW32 &&
args.filter_meta.group == src[1] * 32);
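// note: NCHW4 / NCHW32 pack 4 / 32 channels into one channel slot, so a
// genuinely channel-wise conv has filter_meta.group == src[1] * 4 (or * 32)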
// prefer the special chanwise impl, since the group conv of cudnn
// versions lower than v7.5.0 is still slower than our implementation
// in many channel-wise cases
const bool slow_cudnn_chanwise_impl =
CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
//! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM by default for large images
const int hw_size = src[2] * src[3];
//! choose dnn when stride != 1; may need recalibration for different
//! cudnn versions
const bool prefer_dnn_chanwise =
slow_cudnn_chanwise_impl || args.filter_meta.stride[0] != 1 ||
args.filter_meta.stride[1] != 1 || hw_size < 512;
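//! the 512 threshold is what the small-hw cases in
//! BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN (added in this commit) calibrate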
//! avoid bad cases in cudnn; check the dnn chanwise impl first
if (is_chanwise) {
if (prefer_dnn_chanwise) {
if (sm_algo_pack.chanwise.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise;
if (sm_algo_pack.chanwise8x8x32.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise8x8x32;
} else {
conv_args.dst_layout = &dst_layout;
if (is_cudnn_supported(conv_args)) {
if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
return algo;
}
}
}
}
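//! dispatch summary: channel-wise convs use the in-house kernel when cudnn
//! is older than v7.5.0, stride != 1 or H * W < 512; otherwise they are
//! handed to cudnn via get_cudnn_algo; all other convs fall through below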
//! Prefer CUDNN CONVBIAS.
bool cudnn_conv_bias_act_supported = false;
for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
@@ -139,22 +178,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
return algo;
}
if (args.filter_meta.group > 1) {
#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5)
// prefer the special chanwise impl, since the group conv of cudnn
// versions lower than v7.5.0 is still slower than our implementation
// in many channel-wise cases
if (sm_algo_pack.chanwise.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise;
if (sm_algo_pack.chanwise8x8x32.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes))
return &sm_algo_pack.chanwise8x8x32;
#endif
}
if (auto algo = get_1x1_algo(args)) {
    return algo;
}
int batch = src[0];
if (batch == 1 && sm_algo_pack.a1x1.is_available_reproducible(
            args, reproducible, workspace_limit_in_bytes)) {
    return &sm_algo_pack.a1x1;
}
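//! "dislike dnn batch gemm conv1x1": the batched-gemm 1x1 algo is now only
//! tried eagerly for batch == 1; larger batches rely on the cudnn paths and
//! the get_1x1_algo fallback further below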
// modify conv_args dst_layout
@@ -179,6 +206,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
conv_args = orig_args;
}
if (auto algo = get_1x1_algo(args)) {
return algo;
}
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
if (reproducible) {
return megdnn::get_reproducible_algo<ConvBiasForwardImpl>(
......
@@ -839,6 +839,88 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
size_t RUNS = 1;
bencher.set_display(false).set_times(RUNS);
ConvBias::Param param;
param.format = ConvBias::Param::Format::NCHW;
param.sparse = ConvBias::Param::Sparse::GROUP;
NormalRNG rng;
auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
size_t s) {
param.pad_h = f / 2;
param.pad_w = f / 2;
param.stride_h = s;
param.stride_w = s;
param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
bias = {1, c, 1, 1};
TensorLayout dst_layout;
auto opr = handle_cuda()->create_operator<ConvBias>();
opr->param() = param;
opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
{bias, dtype::Float32()}, {}, dst_layout);
float computation_mops =
static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) *
1e-6;
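        // each output element of a channel-wise conv consumes one f x f
        // window of a single input channel, i.e. f * f multiply-accumulates
        // = 2 * f * f FLOPs; the 1e-6 factor makes this MFLOPs, so
        // MFLOPs / ms prints as GFLOPS below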
bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.set_before_exec_callback(
AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
bencher.set_param(param)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
auto time_in_ms_cudnn =
bencher.execs({src, filter, bias, {}, {}}) / RUNS;
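        // both timings pin the algorithm via AlgoChecker: the in-house
        // CHANNEL_WISE kernel above vs cudnn IMPLICIT_GEMM here, on
        // identical shapes, so the ratio below is a direct
        // kernel-vs-kernel comparison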
printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGB/s "
"cudnn: %.2fms %.2fGB/s "
"speedup: "
"%0.2f (dnn/cudnn)\n",
s, src.to_string().c_str(), filter.to_string().c_str(),
dst_layout.to_string().c_str(), time_in_ms_dnn,
computation_mops / time_in_ms_dnn, time_in_ms_cudnn,
computation_mops / time_in_ms_cudnn,
time_in_ms_cudnn / time_in_ms_dnn);
};
// clang-format off
for(size_t batch:{1, 16, 32, 64, 128}){
run(batch, 32, 112, 112, 3, 1);
run(batch, 96, 112, 112, 3, 2);
run(batch, 96, 112, 112, 3, 1);
run(batch, 144, 56, 56, 3, 2);
run(batch, 144, 56, 56, 3, 1);
run(batch, 192, 28, 28, 3, 1);
run(batch, 384, 14, 14, 3, 1);
run(batch, 576, 14, 14, 3, 1);
run(batch, 960, 7, 7, 3, 1);
//! cases to calibrate the hw_size threshold in the heuristic algo policy
run(batch, 144, 24, 24, 3, 1);
run(batch, 144, 22, 22, 3, 1);
run(batch, 144, 20, 20, 3, 1);
run(batch, 144, 18, 18, 3, 1);
}
// clang-format on
}
TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
size_t RUNS = 1;
......