From 142f31a87580738f498f42b4be388742c4dd95ea Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 7 Dec 2020 17:15:55 +0800
Subject: [PATCH] perf(dnn/cuda): change conv_bias heu, prefer dnn chanwise
 impl, dislike dnn batch gemm conv1x1

GitOrigin-RevId: 323bf6073a7b6e97ce9dfedc8caa49d351e5f96f
---
 dnn/src/cuda/conv_bias/opr_impl.cpp    | 69 ++++++++++++++++------
 dnn/test/cuda/chanwise_convolution.cpp | 82 ++++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 19 deletions(-)

diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp
index ad793a754..34b89f174 100644
--- a/dnn/src/cuda/conv_bias/opr_impl.cpp
+++ b/dnn/src/cuda/conv_bias/opr_impl.cpp
@@ -6,12 +6,13 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
-#include "src/cuda/conv_bias/opr_impl.h"
 #include "megdnn/dtype.h"
-#include "src/cuda/conv_bias/helper.h"
 #include "src/cuda/conv_bias/algo.h"
+#include "src/cuda/conv_bias/helper.h"
+#include "src/cuda/conv_bias/opr_impl.h"
 #include "src/cuda/handle.h"
 #include "src/cuda/utils.h"
 
@@ -124,6 +125,44 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         return nullptr;
     };
 
+    const bool is_chanwise =
+            (args.filter_meta.format == Param::Format::NCHW &&
+             args.filter_meta.group == src[1]) ||
+            (args.filter_meta.format == Param::Format::NCHW4 &&
+             args.filter_meta.group == src[1] * 4) ||
+            (args.filter_meta.format == Param::Format::NCHW32 &&
+             args.filter_meta.group == src[1] * 32);
+    // prefer the special chanwise impl, since the group conv of cudnn
+    // versions lower than v7.5.0 is still slower than our implementation
+    // in many channel-wise cases
+    const bool slow_cudnn_chanwise_impl =
+            CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
+    //! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM by default for large images
+    const int hw_size = src[2] * src[3];
+    //! choose dnn when stride != 1; may need calibration for different cudnn
+    //! versions
+    const bool prefer_dnn_chanwise =
+            slow_cudnn_chanwise_impl || args.filter_meta.stride[0] != 1 ||
+            args.filter_meta.stride[1] != 1 || hw_size < 512;
+    //! avoid bad cases in cudnn: check the dnn chanwise impl first
+    if (is_chanwise) {
+        if (prefer_dnn_chanwise) {
+            if (sm_algo_pack.chanwise.is_available_reproducible(
+                        args, reproducible, workspace_limit_in_bytes))
+                return &sm_algo_pack.chanwise;
+            if (sm_algo_pack.chanwise8x8x32.is_available_reproducible(
+                        args, reproducible, workspace_limit_in_bytes))
+                return &sm_algo_pack.chanwise8x8x32;
+        } else {
+            conv_args.dst_layout = &dst_layout;
+            if (is_cudnn_supported(conv_args)) {
+                if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
+                    return algo;
+                }
+            }
+        }
+    }
+
+    //! Prefer CUDNN CONVBIAS.
     bool cudnn_conv_bias_act_supported = false;
     for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
@@ -139,22 +178,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         return algo;
     }
 
-    if (args.filter_meta.group > 1) {
-#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5)
-        // prefer special chanwise impl since as the group conv of cudnn whose
-        // version is lower than v7.5.0 is still slower than our implementation
-        // in many channel-wise cases
-        if (sm_algo_pack.chanwise.is_available_reproducible(
-                    args, reproducible, workspace_limit_in_bytes))
-            return &sm_algo_pack.chanwise;
-        if (sm_algo_pack.chanwise8x8x32.is_available_reproducible(
-                    args, reproducible, workspace_limit_in_bytes))
-            return &sm_algo_pack.chanwise8x8x32;
-#endif
-    }
-
-    if (auto algo = get_1x1_algo(args)) {
-        return algo;
+    int batch = src[0];
+    if (batch == 1 && sm_algo_pack.a1x1.is_available_reproducible(
+                              args, reproducible, workspace_limit_in_bytes)) {
+        return &sm_algo_pack.a1x1;
     }
 
     // modify conv_args dst_layout
@@ -179,6 +206,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         conv_args = orig_args;
     }
 
+    if (auto algo = get_1x1_algo(args)) {
+        return algo;
+    }
+
     if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
         if (reproducible) {
             return megdnn::get_reproducible_algo<ConvBiasForwardImpl>(
diff --git a/dnn/test/cuda/chanwise_convolution.cpp b/dnn/test/cuda/chanwise_convolution.cpp
index 626d5a30d..14bbbb390 100644
--- a/dnn/test/cuda/chanwise_convolution.cpp
+++ b/dnn/test/cuda/chanwise_convolution.cpp
@@ -839,6 +839,88 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_FLOAT_SMALL) {
     }
 }
 
+TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_CUDNN_DNN) {
+    CUBenchmarker<ConvBiasForward> bencher(handle_cuda());
+    size_t RUNS = 1;
+    bencher.set_display(false).set_times(RUNS);
+
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW;
+    param.sparse = ConvBias::Param::Sparse::GROUP;
+    NormalRNG rng;
+
+    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f,
+                   size_t s) {
+        param.pad_h = f / 2;
+        param.pad_w = f / 2;
+        param.stride_h = s;
+        param.stride_w = s;
+        param.compute_mode = param::ConvBias::ComputeMode::DEFAULT;
+
+        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f},
+                    bias = {1, c, 1, 1};
+
+        TensorLayout dst_layout;
+        auto opr = handle_cuda()->create_operator<ConvBias>();
+        opr->param() = param;
+        opr->deduce_layout({src, dtype::Float32()}, {filter, dtype::Float32()},
+                           {bias, dtype::Float32()}, {}, dst_layout);
+        float computation_mops =
+                static_cast<float>(dst_layout.total_nr_elems() * f * f * 2) *
+                1e-6;
+
+        bencher.set_param(param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .set_rng(0, &rng)
+                .set_rng(1, &rng);
+        bencher.set_before_exec_callback(
+                AlgoChecker<ConvBiasForward>(".+CHANNEL_WISE.+"));
+        auto time_in_ms_dnn = bencher.execs({src, filter, bias, {}, {}}) / RUNS;
+
+        bencher.set_param(param)
+                .set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .set_rng(0, &rng)
+                .set_rng(1, &rng);
+        bencher.set_before_exec_callback(AlgoChecker<ConvBiasForward>(
+                ".+CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM.+"));
+        auto time_in_ms_cudnn =
+                bencher.execs({src, filter, bias, {}, {}}) / RUNS;
+
+        printf("stride=%zu src=%s, filter=%s, dst=%s, dnn: %.2fms %.2fGFLOPS "
+               "cudnn: %.2fms %.2fGFLOPS "
+               "speedup: "
+               "%0.2f (dnn/cudnn)\n",
+               s, src.to_string().c_str(), filter.to_string().c_str(),
+               dst_layout.to_string().c_str(), time_in_ms_dnn,
+               computation_mops / time_in_ms_dnn,
+               time_in_ms_cudnn,
+               computation_mops / time_in_ms_cudnn,
+               time_in_ms_cudnn / time_in_ms_dnn);
+    };
+
+    // clang-format off
+    for(size_t batch:{1, 16, 32, 64, 128}){
+        run(batch, 32, 112, 112, 3, 1);
+        run(batch, 96, 112, 112, 3, 2);
+        run(batch, 96, 112, 112, 3, 1);
+        run(batch, 144, 56, 56, 3, 2);
+        run(batch, 144, 56, 56, 3, 1);
+        run(batch, 192, 28, 28, 3, 1);
+        run(batch, 384, 14, 14, 3, 1);
+        run(batch, 576, 14, 14, 3, 1);
+        run(batch, 960, 7, 7, 3, 1);
+        //! calibrate the hw_size parameter of the heuristic algo policy
+        run(batch, 144, 24, 24, 3, 1);
+        run(batch, 144, 22, 22, 3, 1);
+        run(batch, 144, 20, 20, 3, 1);
+        run(batch, 144, 18, 18, 3, 1);
+    }
+    // clang-format on
+}
+
 TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_FLOAT_SMALL) {
     CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
     size_t RUNS = 1;
-- 
GitLab
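
Editor's note: for readers skimming the diff, the standalone sketch below restates the selection order that the patched get_algorithm_heuristic now follows: the MegDNN chanwise kernels first for channel-wise convolutions (or cuDNN implicit GEMM for large stride-1 images on cuDNN >= 7.5), then cuDNN's fused conv-bias-activation, then the batched-gemm conv1x1 algorithm only when batch == 1, then the plain cuDNN convolution, and finally the 1x1 and other non-cuDNN algorithms. It is an illustration only, not MegDNN code: the struct, the function pick_conv_bias_algo, and the plain bool/int inputs are hypothetical stand-ins for the real TensorLayout/AlgoBase machinery, and availability checks, workspace limits, and the NCHW4/NCHW32 group arithmetic are elided.

// Illustrative model of the new heuristic order only; not MegDNN code.
#include <cstdio>
#include <string>

struct HeuristicInput {           // hypothetical, simplified stand-in for the real args
    bool is_chanwise;             // group count equals the channel count
    int cudnn_major, cudnn_minor; // linked cuDNN version
    int stride, hw_size;          // spatial stride and H * W of the input
    int batch;                    // N of the src tensor
    bool cudnn_conv_bias_act_ok;  // fused cuDNN conv+bias+activation available
    bool cudnn_conv_ok;           // plain cuDNN convolution available
};

std::string pick_conv_bias_algo(const HeuristicInput& in) {
    if (in.is_chanwise) {
        // cuDNN group conv before 7.5 is slow for channel-wise filters, and the
        // MegDNN kernel also wins for strided or small-image (hw < 512) cases.
        bool slow_cudnn = in.cudnn_major < 7 ||
                          (in.cudnn_major == 7 && in.cudnn_minor < 5);
        bool prefer_dnn = slow_cudnn || in.stride != 1 || in.hw_size < 512;
        if (prefer_dnn)
            return "CHANNEL_WISE (MegDNN chanwise kernel)";
        if (in.cudnn_conv_ok)
            return "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM";
    }
    if (in.cudnn_conv_bias_act_ok)  // "Prefer CUDNN CONVBIAS."
        return "cuDNN fused conv-bias-activation";
    if (in.batch == 1)              // batched-gemm conv1x1 is now batch == 1 only
        return "a1x1 (batched gemm conv1x1)";
    if (in.cudnn_conv_ok)           // cuDNN conv + separate bias/activation
        return "cuDNN convolution";
    return "1x1 / other non-cuDNN algorithms";
}

int main() {
    // MobileNet-like channel-wise layer: 14x14 image, stride 1, batch 64.
    HeuristicInput in{true, 7, 6, 1, 14 * 14, 64, true, true};
    std::printf("picked: %s\n", pick_conv_bias_algo(in).c_str());
    return 0;
}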