diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 25b45f281a799ade12ec9cbfb8fb262dbc572196..c2d91b31c741e275ca9c7a3f0f3094d49667af08 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <array>
 #include <memory>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/conv_search_cache.h"
 #include "paddle/fluid/framework/operator_kernel_configs.h"
@@ -90,6 +91,75 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
   return out;
 }
 
+inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
+  int max_algos = 0;
+#if CUDNN_VERSION_MIN(7, 0, 1)
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
+          cudnn_handle, &max_algos));
+#endif
+  return max_algos;
+}
+
+template <typename PerfType>
+void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
+                     std::string kernel_name, int32_t algo_preference,
+                     size_t workspace_byte,
+                     cudnnConvolutionBwdFilterAlgo_t* algo,
+                     bool deterministic) {
+  // Determine the fastest acceptable algo that matches algo_preference
+  // (-1 = any), regardless of mathType.
+
+  VLOG(3) << "=========Full results of algo=========" << kernel_name << ":";
+  for (const auto& result : perf_results) {
+    auto math_type_str = "-";
+    if (result.mathType == CUDNN_TENSOR_OP_MATH) {
+      math_type_str = "+";
+    }
+
+    VLOG(3) << " algo: " << result.algo << ", TC" << math_type_str
+            << ", time: " << result.time << " ms"
+            << ", wksp = " << result.memory << ", status = " << result.status;
+  }
+
+  for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
+    const auto& result = perf_results[i];
+    bool algo_is_tensor_core = (result.mathType == CUDNN_TENSOR_OP_MATH);
+    bool algo_exclusion = false;
+    if (result.status == CUDNN_STATUS_SUCCESS &&
+        (!deterministic ||
+         result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
+        (result.memory <= workspace_byte) &&
+        (algo_preference == -1 || algo_preference == result.algo) &&
+        !algo_exclusion) {
+      if (algo_is_tensor_core && (i != perf_results.size() - 1)) {
+        const auto& next_result = perf_results[i + 1];
+        if (next_result.status == CUDNN_STATUS_SUCCESS &&
+            next_result.algo == result.algo &&
+            next_result.memory == result.memory &&
+            next_result.mathType != CUDNN_TENSOR_OP_MATH &&
+            next_result.time < 1.01 * result.time) {
+          // Skip over this result - it's not really a Tensor Core algo.
+          // Prefer instead the next equivalent non-Tensor Core algo.
+          continue;
+        }
+      }
+      *algo = result.algo;
+      auto math_type_str = "-";
+      if (result.mathType == CUDNN_TENSOR_OP_MATH) {
+        math_type_str = "+";
+      }
+      VLOG(3) << " choose algo: " << result.algo << ", TC" << math_type_str
+              << ", time: " << result.time << " ms"
+              << ", wksp = " << result.memory << ", status = " << result.status;
+      return;
+    }
+  }
+}
+
 using framework::ConvSearchCache;
 
 struct ConvArgs {
@@ -396,6 +466,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     algo_t algo;
     if (!exhaustive && !deterministic) {
 #if CUDNN_VERSION >= 7001
+      /*
       using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
       int perf_count;
       int best_algo_idx = 0;
@@ -411,7 +482,39 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       if (workspace_size > workspace_size_limit) {
         workspace_size = workspace_size_limit;
       }
+
+      auto math_type_str = "-";
+      if ((perf_results.get())[best_algo_idx].mathType ==
+          CUDNN_TENSOR_OP_MATH) {
+        math_type_str = "+";
+      }
+      VLOG(3) << " algo: " << (perf_results.get())[best_algo_idx].algo
+              << ", TC" << math_type_str
+              << ", time: " << (perf_results.get())[best_algo_idx].time
+              << " ms"
+              << ", wksp = " << (perf_results.get())[best_algo_idx].memory
+              << ", status = " << (perf_results.get())[best_algo_idx].status;
+*/
+      auto max_bwd_filt_algos = MaxBackwardFilterAlgos(args.handle);
+      std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(
+          max_bwd_filt_algos);
+      int actual_bwd_filter_algos = 0;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
+              args.handle, args.idesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.wdesc.desc(), bwd_filt_results.size(),
+              &actual_bwd_filter_algos, bwd_filt_results.data()));
+      bwd_filt_results.resize(actual_bwd_filter_algos);
+      AlgoFinalSelect(
+          bwd_filt_results, "backprop-to-filter", -1, workspace_size_limit,
+          &algo, deterministic);
+      workspace_size = GetWorkspaceSize(args, algo);
+      if (workspace_size > workspace_size_limit) {
+        workspace_size = workspace_size_limit;
+      }
+
 #else
+      VLOG(3) << "=======cudnnGetConvolutionBackwardFilterAlgorithm=====";
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               args.handle, args.idesc.desc(), args.odesc.desc(),
@@ -420,8 +523,10 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
               workspace_size_limit, &algo));
 #endif
     } else if (deterministic) {
+      VLOG(3) << "======choose deterministic algo======";
       return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
     } else {
+      VLOG(3) << "========Get cache algo===========";
       auto& dev_ctx =
           ctx.template device_context<platform::CUDADeviceContext>();
       auto workspace_handle = dev_ctx.cudnn_workspace_handle();
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 7e85cb57f339331d5dd4233c2cad562c56d1d3af..5c905bf22728ca1bf4cc020c91111427aef3c47e 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnGetVersion);                                \
   __macro(cudnnFindConvolutionForwardAlgorithmEx);         \
   __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx);  \
+  __macro(cudnnFindConvolutionBackwardFilterAlgorithm);    \
   __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);    \
   __macro(cudnnGetErrorString);                            \
   __macro(cudnnCreateDropoutDescriptor);                   \
@@ -194,7 +195,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
   __macro(cudnnBatchNormalizationForwardTrainingEx);             \
   __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize);    \
   __macro(cudnnBatchNormalizationBackwardEx);                    \
-  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
+  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
 
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
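
Note: the snippet below is a standalone sketch, not part of the patch, of the selection policy AlgoFinalSelect applies to the perf results returned by cudnnFindConvolutionBackwardFilterAlgorithm: walk the time-sorted results and take the first algorithm that succeeded, fits the workspace budget, and satisfies the determinism requirement. The helper name PickBwdFilterAlgo and the hand-built sample values are illustrative only, and the Tensor Core duplicate-skipping heuristic from the patch is omitted. It compiles against cudnn.h but calls no cuDNN functions, so it runs without a GPU.

#include <cudnn.h>

#include <cstdio>
#include <vector>

// Return the first (i.e. fastest) perf result that succeeded, fits the
// workspace budget and, if requested, is deterministic.
bool PickBwdFilterAlgo(
    const std::vector<cudnnConvolutionBwdFilterAlgoPerf_t>& results,
    size_t workspace_limit, bool deterministic,
    cudnnConvolutionBwdFilterAlgo_t* algo) {
  for (const auto& r : results) {
    if (r.status != CUDNN_STATUS_SUCCESS) continue;
    if (r.memory > workspace_limit) continue;
    if (deterministic && r.determinism != CUDNN_DETERMINISTIC) continue;
    *algo = r.algo;  // results come back sorted by execution time
    return true;
  }
  return false;  // caller must fall back, e.g. to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
}

int main() {
  // Hand-built perf entries standing in for the output of
  // cudnnFindConvolutionBackwardFilterAlgorithm.
  std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> results(2);
  results[0].algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
  results[0].status = CUDNN_STATUS_SUCCESS;
  results[0].time = 0.5f;
  results[0].memory = 1ULL << 30;  // 1 GiB: exceeds the limit below
  results[0].determinism = CUDNN_NON_DETERMINISTIC;
  results[1].algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
  results[1].status = CUDNN_STATUS_SUCCESS;
  results[1].time = 0.8f;
  results[1].memory = 4ULL << 20;  // 4 MiB
  results[1].determinism = CUDNN_DETERMINISTIC;

  cudnnConvolutionBwdFilterAlgo_t algo;
  if (PickBwdFilterAlgo(results, 64ULL << 20, /*deterministic=*/true, &algo)) {
    std::printf("chose algo %d\n", static_cast<int>(algo));  // prints 1
  }
  return 0;
}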