/**
 * \file dnn/src/cuda/conv_bias/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/dtype.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"

#include "src/common/algo_chooser.h"
#include "src/cuda/cudnn_with_check.h"

namespace megdnn {
namespace cuda {

void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
                               _megdnn_tensor_in bias, _megdnn_tensor_in z,
                               _megdnn_tensor_out dst,
                               const PreprocessedFilter* preprocessed_filter,
                               _megdnn_workspace workspace) {
    check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
               workspace.size, preprocessed_filter);
    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
                            preprocessed_filter);
    auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
                              z.layout, dst.layout);
    algo->check_workspace(args, workspace).exec(args);
}

std::vector<ConvBiasForwardImpl::Algorithm*>
ConvBiasForwardImpl::get_all_algorithms(const TensorLayout& src,
                                        const TensorLayout& filter,
                                        const TensorLayout& bias,
                                        const TensorLayout& z,
                                        const TensorLayout& dst) {
    return megdnn::get_all_algorithms<ConvBiasForwardImpl>(
            {this, src, filter, bias, z, dst});
}

ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    using namespace conv_bias;
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
    auto dst_layout = *args.dst_layout;
    if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
        dst_layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype,
                                            args.filter_layout->dtype,
                                            dst_layout.dtype);
    }
    auto conv_args = args;

    auto cudnn_conv_bias_act_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo);
    };

    auto cudnn_conv_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_from_enum(algo);
    };

    auto get_cudnn_algo =
            [this, &conv_args, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr](
                    const thin_function<AlgoBase*(cudnnConvolutionFwdAlgo_t)>&
                            cb) -> AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNForwardDescs desc;
        conv_args.init_conv_desc(desc);
#if CUDNN_MAJOR >= 7
        auto& cudnn = static_cast<HandleImpl*>(this->handle())->cudnn();
        int max_count = 0;
        cudnn_check(cudnn.GetConvolutionForwardAlgorithmMaxCount(cudnn_handle,
                                                                 &max_count));
        SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnn.GetConvolutionForwardAlgorithm_v7(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count,
                &ret_count, algo_perf.data()));
        for (int i = 0; i < ret_count; ++i) {
            auto conv_bias_algo = cb(algo_perf[i].algo);
            if (conv_bias_algo->is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes)) {
                return conv_bias_algo;
            }
        }
#else
        cudnnConvolutionFwdAlgo_t algo;
        cudnn_check(cudnnGetConvolutionForwardAlgorithm(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc,
                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));

        auto conv_bias_algo = cb(algo);
        if (conv_bias_algo->is_available_attribute(args, positive_attr,
                                                   negative_attr,
                                                   workspace_limit_in_bytes))
            return conv_bias_algo;
#endif
        return nullptr;
    };
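    // Note on the cuDNN path above: with cuDNN >= 7,
    // cudnnGetConvolutionForwardAlgorithm_v7 returns its candidates sorted
    // by expected performance, so the loop picks the fastest algorithm that
    // also passes is_available_attribute() (availability plus the requested
    // algo attributes and the workspace limit). A nullptr result simply
    // means "no acceptable cuDNN algorithm"; the heuristic below then falls
    // through to the other algo packs.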
    auto get_1x1_algo = [workspace_limit_in_bytes, positive_attr,
                         negative_attr](const AlgoBase::SizeArgs& size_arg)
            -> ConvBiasForwardImpl::AlgoBase* {
        if (sm_algo_pack.batched_matmul.is_available_attribute(
                    size_arg, positive_attr, negative_attr,
                    workspace_limit_in_bytes)) {
            return &sm_algo_pack.batched_matmul;
        }
        return nullptr;
    };

    const bool is_chanwise =
            (args.filter_meta.format == Param::Format::NCHW &&
             args.filter_meta.group == src[1]) ||
            (args.filter_meta.format == Param::Format::NCHW4 &&
             args.filter_meta.group == src[1] * 4) ||
            (args.filter_meta.format == Param::Format::NCHW32 &&
             args.filter_meta.group == src[1] * 32);
    // prefer the special chanwise impl, since cudnn group conv before
    // v7.5.0 is still slower than our implementation in many channel-wise
    // cases
    const bool slow_cudnn_chanwise_impl =
            CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
    //! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM by default for large
    //! images
    const int hw_size = src[2] * src[3];
    //! choose dnn when stride != 1; may need calibration for different cudnn
    //! versions
    const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
                                     args.filter_meta.stride[0] != 1 ||
                                     args.filter_meta.stride[1] != 1 ||
                                     hw_size < 512;
    //! avoid bad cases in cudnn: check the dnn chanwise impl first
    if (is_chanwise) {
        if (prefer_dnn_chanwise) {
            if (sm_algo_pack.chanwise.is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise;
            if (sm_algo_pack.chanwise8x8x32.is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise8x8x32;
        } else {
            conv_args.dst_layout = &dst_layout;
            if (is_cudnn_supported(conv_args)) {
                if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
                    return algo;
                }
            }
        }
    }
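    // Worked example for the chanwise branch above (hypothetical shapes):
    // an NCHW src of (N=8, C=32, H=16, W=16) with filter_meta.group == 32 is
    // channel-wise, and hw_size == 16 * 16 == 256 < 512, so the dnn chanwise
    // kernels are tried before cuDNN even when both strides are 1.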
    //! Prefer CUDNN CONVBIAS.
    bool cudnn_conv_bias_act_supported = false;
    for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
        if (algo.is_available_attribute(args, positive_attr, negative_attr,
                                        workspace_limit_in_bytes)) {
            cudnn_conv_bias_act_supported = true;
            break;
        }
    }

    if (cudnn_conv_bias_act_supported) {
        if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper))
            return algo;
    }

    // modify conv_args dst_layout
    conv_args.dst_layout = &dst_layout;
    if (is_cudnn_supported(conv_args)) {
        if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper))
            return algo;
    }

    if (args.filter_meta.group > 1) {
        auto orig_args = conv_args;
        TensorLayout src, dst, bias;
        AlgoGroupConvGeneral::modify_size_args(conv_args, src, dst, bias);
        if (auto algo = get_1x1_algo(conv_args)) {
            return sm_algo_pack.algo2gconv.at(algo);
        }
        if (is_cudnn_supported(conv_args)) {
            if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
                return sm_algo_pack.algo2gconv.at(algo);
            }
        }
        conv_args = orig_args;
    }

    if (auto algo = get_1x1_algo(args)) {
        return algo;
    }

    if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
                args, positive_attr, negative_attr,
                workspace_limit_in_bytes)) {
        return &sm_algo_pack.fallback_nchw_qs8;
    }

    if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    } else {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    }
}

const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
    return "CONV_BIAS_CUDA";
}

size_t ConvBiasForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst,
        const PreprocessedFilter* preprocessed_filter) {
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst,
                            preprocessed_filter};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->get_workspace_in_bytes(args);
}

size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst) {
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->get_preprocess_workspace_in_bytes(args);
}

SmallVector<TensorLayout>
ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst) {
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->deduce_preprocessed_filter_layout(args);
}

void ConvBiasForwardImpl::exec_preprocess(
        const TensorLayout& src_layout, _megdnn_tensor_in filter,
        _megdnn_tensor_in bias, const TensorLayout& z_layout,
        const TensorLayout& dst_layout,
        PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) {
    TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout},
            z{nullptr, z_layout};
    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
                            preprocessed_filter);
    auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
                              z.layout, dst.layout);
    return algo->exec_preprocess(args);
}

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen
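// Usage note (informational sketch): callers normally reach the heuristic
// above through megdnn::get_algorithm(this, ...), which honors an explicitly
// configured execution policy first and only falls back to
// get_algorithm_heuristic() when no algorithm was set.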