From f7994683bd83584683629e65554c0d4440f09c62 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 7 Mar 2022 20:00:14 +0800 Subject: [PATCH] feat(cuda): add large kernel direct conv to heuristic algo chooser GitOrigin-RevId: bc927b6df736ee807b816d1652d602742c591f8e --- dnn/src/cuda/conv_bias/opr_impl.cpp | 9 ++++++++- dnn/src/cuda/convolution/opr_impl.cpp | 11 ++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index 2cb9a04a9..0a606e2ba 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -148,7 +148,9 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( //! choose for large kernel cases size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; size_t hi = src[2], wi = src[3]; - const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw; + const bool prefer_dnn_lk_implbmm = + hi <= 2 * fh && wi <= 2 * fw && wi < 32 && hi <= 32; + const bool prefer_direct_lk = fh > 10 && fw > 10; //! avoid bad case in cudnn, check dnn chanwise impl first if (is_chanwise) { if (prefer_dnn_lk_implbmm) { @@ -160,6 +162,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return &sm_algo_pack.f32_implicit_bmm[0]; + } else if ( + prefer_direct_lk && + sm_algo_pack.depthwise_large_filter.is_available_attribute( + args, positive_attr, negative_attr, workspace_limit_in_bytes)) { + return &sm_algo_pack.depthwise_large_filter; } else if (prefer_dnn_chanwise) { if (sm_algo_pack.chanwise.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index eefacf57c..5a0109cfe 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -119,7 +119,10 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl:: size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; size_t ho = diff[2], wo = diff[3]; const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW && - ho <= 2 * fh && wo <= 2 * fw; + ho <= 2 * fh && wo <= 2 * fw && ho < 32 && + wo < 32; + const bool prefer_direct_lk = + args.filter_meta.format == Param::Format::NCHW && fh > 10 && fw > 10; if (prefer_dnn_lk_implbmm) { #if CUDA_VERSION >= 10020 if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute( @@ -131,6 +134,12 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl:: return &sm_algo_pack.implbmm_nchw_fma[0]; } + if (prefer_direct_lk && + sm_algo_pack.depthwise_large_filter.is_available_attribute( + args, positive_attr, negative_attr, workspace_limit_in_bytes)) { + return &sm_algo_pack.depthwise_large_filter; + } + if (args.filter_meta.group > 1 && sm_algo_pack.chanwise.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { -- GitLab