#include "src/cuda/conv_bias/opr_impl.h" #include "megdnn/dtype.h" #include "src/cuda/conv_bias/algo.h" #include "src/cuda/conv_bias/helper.h" #include "src/cuda/handle.h" #include "src/cuda/utils.h" #include "src/common/algo_chooser.h" #include "src/common/conv_bias.h" #include "src/cuda/cudnn_with_check.h" namespace megdnn { namespace cuda { void ConvBiasForwardImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { check_exec_allow_noncontiguous( src.layout, filter.layout, bias.layout, z.layout, dst.layout, workspace.size, preprocessed_filter); AlgoBase::ExecArgs args( this, src, filter, bias, z, dst, workspace, preprocessed_filter); auto algo = get_algorithm( this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); algo->exec(args); }; std::vector ConvBiasForwardImpl::get_all_algorithms( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { return megdnn::get_all_algorithms( {this, src, filter, bias, z, dst}); } std::vector ConvBiasForwardImpl::get_all_algorithms_safe( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { return megdnn::get_all_algorithms_safe( {this, src, filter, bias, z, dst}); } ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) { using namespace conv_bias; AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; #if CUDNN_VERSION >= 8004 if (sm_algo_pack.cudnn_conv_v8.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.cudnn_conv_v8; } if (sm_algo_pack.cudnn_conv_bias_activation_v8.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.cudnn_conv_bias_activation_v8; } #endif auto dst_layout = *args.dst_layout; if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { dst_layout.dtype = DType(); args.opr->check_or_deduce_dtype_fwd( args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype); } auto conv_args = args; auto cudnn_conv_bias_act_from_enum_wrapper = [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* { return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo); }; auto cudnn_conv_from_enum_wrapper = [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* { return sm_algo_pack.cudnn_conv_from_enum(algo); }; auto get_cudnn_algo = [this, &conv_args, &args, workspace_limit_in_bytes, positive_attr, negative_attr]( const thin_function& cb) -> AlgoBase* { auto cudnn_handle = cuda::cudnn_handle(this->handle()); CUDNNForwardDescs desc; conv_args.init_conv_desc(desc); #if CUDNN_MAJOR >= 7 int max_count = 0; cudnn_check( cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_count)); SmallVector algo_perf(max_count); int ret_count = 0; cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7( cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count, &ret_count, algo_perf.data())); for (int i = 0; i < ret_count; ++i) { auto conv_bias_algo = cb(algo_perf[i].algo); if (conv_bias_algo->is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return conv_bias_algo; } } #else cudnnConvolutionFwdAlgo_t algo; cudnn_check(cudnnGetConvolutionForwardAlgorithm( cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc, desc.conv_desc.conv_desc, desc.dst_desc.desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_in_bytes, &algo)); auto conv_bias_algo = cb(algo); if (conv_bias_algo->is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return conv_bias_algo; #endif return nullptr; }; auto get_1x1_algo = [workspace_limit_in_bytes, positive_attr, negative_attr](const AlgoBase::SizeArgs& size_arg) -> ConvBiasForwardImpl::AlgoBase* { if (sm_algo_pack.batched_matmul.is_available_attribute( size_arg, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.batched_matmul; } return nullptr; }; const bool is_chanwise = (args.filter_meta.format == Param::Format::NCHW && args.filter_meta.group == src[1]) || (args.filter_meta.format == Param::Format::NCHW4 && args.filter_meta.group == src[1] * 4) || (args.filter_meta.format == Param::Format::NCHW32 && args.filter_meta.group == src[1] * 32); // prefer special chanwise impl since as the group conv of cudnn // whose version is lower than v7.5.0 is still slower than our // implementation in many channel-wise cases const bool slow_cudnn_chanwise_impl = CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5); //! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM default for large image const int hw_size = src[2] * src[3]; //! choose dnn when stride != 1, may need calibrate for different cudnn //! version const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl || args.filter_meta.stride[0] != 1 || args.filter_meta.stride[1] != 1 || hw_size < 512; //! choose for large kernel cases size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; size_t hi = src[2], wi = src[3]; const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw; //! filter size > 9, choose large kernel cases const bool prefer_direct_lk = fh > 9 && fw > 9; //! avoid bad case in cudnn, check dnn chanwise impl first if (is_chanwise) { if (prefer_dnn_lk_implbmm) { #if CUDA_VERSION >= 10020 if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return &sm_algo_pack.f16_implicit_bmm[0]; #endif if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return &sm_algo_pack.f32_implicit_bmm[0]; } else if ( prefer_direct_lk && sm_algo_pack.depthwise_large_filter.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.depthwise_large_filter; } else if (prefer_dnn_chanwise) { if (sm_algo_pack.chanwise.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return &sm_algo_pack.chanwise; if (sm_algo_pack.chanwise8x8x32.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) return &sm_algo_pack.chanwise8x8x32; } else { conv_args.dst_layout = &dst_layout; if (is_cudnn_supported(conv_args)) { if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) { return algo; } } } } //! Prefer CUDNN CONVBIAS. bool cudnn_conv_bias_act_supported = false; for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) { if (algo.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { cudnn_conv_bias_act_supported = true; break; } } if (cudnn_conv_bias_act_supported) { if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper)) return algo; } // modify conv_args dst_layout conv_args.dst_layout = &dst_layout; if (is_cudnn_supported(conv_args)) { if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) return algo; } if (auto algo = get_1x1_algo(args)) { return algo; } if (args.filter_meta.group > 1 && sm_algo_pack.group.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.group; } if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.fallback_nchw_qs8; } if (sm_algo_pack.int1_simple.is_available_attribute( args, positive_attr, negative_attr, workspace_limit_in_bytes)) { return &sm_algo_pack.int1_simple; } if (args.src_layout->dtype.enumv() != DTypeTrait::enumv) { return megdnn::get_algo_match_attribute( sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes, "cuda convbias fwd", positive_attr, negative_attr); } else { return megdnn::get_algo_match_attribute( sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes, "cuda convbias fwd", positive_attr, negative_attr); } } const char* ConvBiasForwardImpl::get_algorithm_set_name() const { return "CONV_BIAS_CUDA"; } size_t ConvBiasForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { TensorLayoutArray layouts{src, filter, bias, z, dst}; AlgorithmCache::Key key{this->handle(), this->get_opr_type(), layouts.data(), layouts.size(), &this->param(), sizeof(this->param())}; auto rst = AlgorithmCache::instance().get(key); if (rst.policy.algo.valid()) { return rst.workspace; } AlgoBase::SizeArgs args{this, src, filter, bias, z, dst, preprocessed_filter}; return get_algorithm(this, src, filter, bias, z, dst)->get_workspace_in_bytes(args); }; size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; return get_algorithm(this, src, filter, bias, z, dst) ->get_preprocess_workspace_in_bytes(args); } SmallVector ConvBiasForwardImpl::deduce_preprocessed_filter_layout( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; return get_algorithm(this, src, filter, bias, z, dst) ->deduce_preprocessed_filter_layout(args); } void ConvBiasForwardImpl::exec_preprocess( const TensorLayout& src_layout, _megdnn_tensor_in filter, _megdnn_tensor_in bias, const TensorLayout& z_layout, const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, z{nullptr, z_layout}; AlgoBase::ExecArgs args( this, src, filter, bias, z, dst, workspace, preprocessed_filter); auto algo = get_algorithm( this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); return algo->exec_preprocess(args); } } // namespace cuda } // namespace megdnn // vim: syntax=cpp.doxygen