From dcc9693582fa559ba2ae1e49d1c6df853136c250 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Fri, 11 Feb 2022 18:07:54 +0800
Subject: [PATCH] feat(dnn/cuda): add heuristic rule for implicit batched gemm large kernel dwconv2d kernels

GitOrigin-RevId: 2d2c213bfdf91e85b2513cafb1dda0f6940199e5
---
 dnn/src/cuda/conv_bias/opr_impl.cpp   | 13 ++++++++++++-
 dnn/src/cuda/convolution/opr_impl.cpp | 13 +++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp
index bf3742785..17ea6eb94 100644
--- a/dnn/src/cuda/conv_bias/opr_impl.cpp
+++ b/dnn/src/cuda/conv_bias/opr_impl.cpp
@@ -145,9 +145,20 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
     const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
                                      args.filter_meta.stride[0] != 1 ||
                                      args.filter_meta.stride[1] != 1 || hw_size < 512;
+    //! large kernel: prefer implicit batched gemm if input is at most 2x the filter
+    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
+    size_t hi = src[2], wi = src[3];
+    const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
     //! avoid bad case in cudnn, check dnn chanwise impl first
     if (is_chanwise) {
-        if (prefer_dnn_chanwise) {
+        if (prefer_dnn_lk_implbmm) {
+            if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
+                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
+                return &sm_algo_pack.f16_implicit_bmm[0];
+            if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
+                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
+                return &sm_algo_pack.f32_implicit_bmm[0];
+        } else if (prefer_dnn_chanwise) {
             if (sm_algo_pack.chanwise.is_available_attribute(
                         args, positive_attr, negative_attr, workspace_limit_in_bytes))
                 return &sm_algo_pack.chanwise;
diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp
index c39563ca0..c1d1784d4 100644
--- a/dnn/src/cuda/convolution/opr_impl.cpp
+++ b/dnn/src/cuda/convolution/opr_impl.cpp
@@ -115,6 +115,19 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
         const AlgoAttribute& negative_attr) {
     AlgoBase::SizeArgs args(this, filter, diff, grad);
 
+    //! large kernel: prefer implicit batched gemm if diff is at most 2x the filter
+    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
+    size_t ho = diff[2], wo = diff[3];
+    const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
+                                       ho <= 2 * fh && wo <= 2 * fw;
+    if (prefer_dnn_lk_implbmm) {
+        if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
+                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
+            return &sm_algo_pack.implbmm_nchw_hmma[0];
+        if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(args, positive_attr, negative_attr, workspace_limit_in_bytes))
+            return &sm_algo_pack.implbmm_nchw_fma[0];
+    }
+
     if (args.filter_meta.group > 1 &&
         sm_algo_pack.chanwise.is_available_attribute(
                 args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
-- 
GitLab