From c10157a5df5859ec11c93ff23fa1fb1ad35c87b9 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Mon, 3 Jun 2019 06:47:49 -0500 Subject: [PATCH] revise the cudnn conv choose algorithm to improve the performance(mask rcnn benchmark) (#17753) * revise conv layer cudnn algo choose test=develop * update for code style test=develop * update for code style test=develop --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 60 ++++++++++++++++++++++ paddle/fluid/platform/dynload/cudnn.h | 23 +++++---- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 158d6ced2..4c3c7b2b2 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -166,10 +166,23 @@ class CUDNNConvOpKernel : public framework::OpKernel { // TODO(dangqingqing) simplify the following code by SearchAlgorithm in // conv_cudnn_helper.h if ((!exhaustive_search) && (!half_float)) { +#if CUDNN_VERSION >= 7001 + using perf_t = cudnnConvolutionFwdAlgoPerf_t; + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, + perf_results.get())); + algo = (perf_results.get())[best_algo_idx].algo; +#else CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &algo)); +#endif + VLOG(3) << "cuDNN forward algo " << algo; } else if (exhaustive_search && (!half_float)) { AlgorithmsCache& algo_cache = @@ -388,6 +401,37 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else if (FLAGS_cudnn_deterministic) { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { +#if CUDNN_VERSION >= 7001 + using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results( + new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input + // differential + // tensor descriptor. + cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, kNUM_CUDNN_BWD_DATA_ALGS, &perf_count, + perf_results.get())); + data_algo = (perf_results.get())[best_algo_idx].algo; + int stride_dim = input->dims().size() - 2; + bool blacklist = + std::any_of(strides.begin(), strides.begin() + stride_dim, + [=](int n) { return n != 1; }); + if (blacklist && (static_cast( + perf_results[best_algo_idx].algo) == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || + static_cast( + perf_results[best_algo_idx].algo) == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { + data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } +#else CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -400,6 +444,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_input_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); +#endif } CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( @@ -437,12 +482,27 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else if (FLAGS_cudnn_deterministic) { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { +#if CUDNN_VERSION >= 7001 + using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results( + new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); + + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle, cudnn_input_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_filter_desc, kNUM_CUDNN_BWD_FILTER_ALGS, + &perf_count, perf_results.get())); + filter_algo = (perf_results.get())[best_algo_idx].algo; +#else CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &filter_algo)); +#endif } CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 3008c1669..67e2a18dd 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -172,16 +172,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7001 -#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ - __macro(cudnnSetConvolutionGroupCount); \ - __macro(cudnnSetConvolutionMathType); \ - __macro(cudnnConvolutionBiasActivationForward); \ - __macro(cudnnCreateCTCLossDescriptor); \ - __macro(cudnnDestroyCTCLossDescriptor); \ - __macro(cudnnGetCTCLossDescriptor); \ - __macro(cudnnSetCTCLossDescriptor); \ - __macro(cudnnGetCTCLossWorkspaceSize); \ - __macro(cudnnCTCLoss); +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); \ + __macro(cudnnSetConvolutionMathType); \ + __macro(cudnnConvolutionBiasActivationForward); \ + __macro(cudnnCreateCTCLossDescriptor); \ + __macro(cudnnDestroyCTCLossDescriptor); \ + __macro(cudnnGetCTCLossDescriptor); \ + __macro(cudnnSetCTCLossDescriptor); \ + __macro(cudnnGetCTCLossWorkspaceSize); \ + __macro(cudnnCTCLoss); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ + __macro(cudnnGetConvolutionForwardAlgorithm_v7); CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif -- GitLab