提交 f7994683 编写于 作者: M Megvii Engine Team

feat(cuda): add large kernel direct conv to heuristic algo chooser

GitOrigin-RevId: bc927b6df736ee807b816d1652d602742c591f8e
上级 6dc0c0b9
...@@ -148,7 +148,9 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( ...@@ -148,7 +148,9 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
//! choose for large kernel cases //! choose for large kernel cases
size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
size_t hi = src[2], wi = src[3]; size_t hi = src[2], wi = src[3];
const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw; const bool prefer_dnn_lk_implbmm =
hi <= 2 * fh && wi <= 2 * fw && wi < 32 && hi <= 32;
const bool prefer_direct_lk = fh > 10 && fw > 10;
//! avoid bad case in cudnn, check dnn chanwise impl first //! avoid bad case in cudnn, check dnn chanwise impl first
if (is_chanwise) { if (is_chanwise) {
if (prefer_dnn_lk_implbmm) { if (prefer_dnn_lk_implbmm) {
...@@ -160,6 +162,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( ...@@ -160,6 +162,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute( if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) args, positive_attr, negative_attr, workspace_limit_in_bytes))
return &sm_algo_pack.f32_implicit_bmm[0]; return &sm_algo_pack.f32_implicit_bmm[0];
} else if (
prefer_direct_lk &&
sm_algo_pack.depthwise_large_filter.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.depthwise_large_filter;
} else if (prefer_dnn_chanwise) { } else if (prefer_dnn_chanwise) {
if (sm_algo_pack.chanwise.is_available_attribute( if (sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) args, positive_attr, negative_attr, workspace_limit_in_bytes))
......
...@@ -119,7 +119,10 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl:: ...@@ -119,7 +119,10 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
size_t ho = diff[2], wo = diff[3]; size_t ho = diff[2], wo = diff[3];
const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW && const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
ho <= 2 * fh && wo <= 2 * fw; ho <= 2 * fh && wo <= 2 * fw && ho < 32 &&
wo < 32;
const bool prefer_direct_lk =
args.filter_meta.format == Param::Format::NCHW && fh > 10 && fw > 10;
if (prefer_dnn_lk_implbmm) { if (prefer_dnn_lk_implbmm) {
#if CUDA_VERSION >= 10020 #if CUDA_VERSION >= 10020
if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute( if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
...@@ -131,6 +134,12 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl:: ...@@ -131,6 +134,12 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
return &sm_algo_pack.implbmm_nchw_fma[0]; return &sm_algo_pack.implbmm_nchw_fma[0];
} }
if (prefer_direct_lk &&
sm_algo_pack.depthwise_large_filter.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.depthwise_large_filter;
}
if (args.filter_meta.group > 1 && if (args.filter_meta.group > 1 &&
sm_algo_pack.chanwise.is_available_attribute( sm_algo_pack.chanwise.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) { args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册