diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index fac8e24251033c301c911f35dcfd0ddb82b713ce..55502eaf4e54957edb1f7c3cfa9616be3f99cf6a 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" @@ -90,6 +91,61 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { return out; } +inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { + int max_algos = 0; +#if CUDNN_VERSION_MIN(7, 0, 1) + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnn_handle, &max_algos)); +#endif + return max_algos; +} + +template +void ChooseAlgo(const std::vector& perf_results, + size_t workspace_byte, AlgoType* algo) { + VLOG(3) << "=========BwdFilterAlgo Perf result========="; + for (const auto& result : perf_results) { + auto math_type_str = "False"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "True"; + } + VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str + << ", time: " << result.time << " ms" + << ", wksp = " << result.memory << ", status = " << result.status; + } + + for (size_t i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + (result.memory <= workspace_byte)) { + if ((result.mathType == CUDNN_TENSOR_OP_MATH) && + (i != perf_results.size() - 1)) { + const auto& next_result = perf_results[i + 1]; + if (next_result.status == CUDNN_STATUS_SUCCESS && + next_result.algo == result.algo && + next_result.memory == result.memory && + next_result.mathType != CUDNN_TENSOR_OP_MATH && + next_result.time < 1.01 * result.time) { + // Skip over this result- it's not really a Tensor Core algo. + // Because it is only 1% performance difference. + // Prefer to choose the next equivalent non-Tensor Core algo. + continue; + } + } + *algo = result.algo; + auto math_type_str = "0"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "1"; + } + VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str + << ", time: " << result.time << " ms" + << ", wksp = " << result.memory << ", status = " << result.status; + return; + } + } +} + using framework::ConvSearchCache; struct ConvArgs { @@ -401,7 +457,6 @@ struct SearchAlgorithm { bool deterministic, const framework::ExecutionContext& ctx) { auto dtype = platform::CudnnDataType::type; - bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF); size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; bool has_got_workspace_size = true; @@ -422,7 +477,7 @@ struct SearchAlgorithm { #endif algo_t algo; - if (!exhaustive && !deterministic) { + if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; int perf_count; @@ -462,34 +517,57 @@ struct SearchAlgorithm { VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + if (dtype != CUDNN_DATA_HALF) { + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + int returned_algo_count; + std::array perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.odesc.desc(), args.o->data(), + args.cdesc.desc(), args.wdesc.desc(), + const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, + workspace_size_limit); + + VLOG(3) + << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return perf_stat[0].algo; + }); + } else { + auto max_algos = MaxBwdFilterAlgos(args.handle); + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + algo_t chosen_algo; + std::vector perf_results(max_algos); + int actual_algos = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.odesc.desc(), args.o->data(), + cudnnFindConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; - }); + perf_results.size(), &actual_algos, + perf_results.data())); + perf_results.resize(actual_algos); + ChooseAlgo(perf_results, workspace_size_limit, + &chosen_algo); + return chosen_algo; + }); + } } VLOG(3) << "choose algo " << algo; return algo; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 88b545b48e5328883c1554c0efc75140006ce787..db84b8731f9ca467c4521221a3dbe0b1fc61b597 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnGetVersion); \ __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ __macro(cudnnGetErrorString); \ __macro(cudnnCreateDropoutDescriptor); \ @@ -177,7 +178,8 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnCTCLoss); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ - __macro(cudnnGetConvolutionForwardAlgorithm_v7); + __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif