From c17f9cf25fd42ab868983a85c03d8c9a2b4a007d Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Wed, 23 Sep 2020 19:09:28 +0800 Subject: [PATCH] [bug fix]:Memory increases after adapting the cudnn version to cudnn8 (#27436) * [bug fix]:Memory increases after adapting the cudnn version to 8 * [bug fix]cudnnGetConvolutionForwardAlgorithm not defined --- paddle/fluid/operators/conv_cudnn_helper.h | 30 ++++++++++++++++++- .../fluid/operators/fused/conv_fusion_op.cu | 10 ++++++- paddle/fluid/platform/dynload/cudnn.cc | 8 +++++ paddle/fluid/platform/dynload/cudnn.h | 1 + 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 25b45f281a..fac8e24251 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -162,7 +162,20 @@ struct SearchAlgorithm { workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { +#if CUDNN_VERSION >= 8000 workspace_size_limit = workspace_size; +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif } #else PADDLE_ENFORCE_CUDA_SUCCESS( @@ -291,8 +304,23 @@ struct SearchAlgorithm { #endif workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { - workspace_size_limit = workspace_size; has_got_workspace_size = false; +#if CUDNN_VERSION >= 8000 + // There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8 + // version. + workspace_size_limit = workspace_size; +#else + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif } #else PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index b22f28fbbe..49fded886a 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(transformed_input.dims()); auto f_dims = framework::vectorize(filter->dims()); if (!exhaustive_search) { +#if CUDNN_VERSION >= 8000 int perf_count; int best_algo_idx = 0; size_t tmp_size = 0; @@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - VLOG(3) << "cuDNN forward algo " << algo; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); if (workspace_size_in_bytes > workspace_size_limit) workspace_size_limit = workspace_size_in_bytes; +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + VLOG(3) << "cuDNN forward algo " << algo; +#endif } else { std::function search_func = [&]() -> cudnnConvolutionFwdAlgo_t { diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 1166dc5e4a..4c59fe5e9b 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); #endif @@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R8 +CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index fba4141764..dd0a2e1968 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnSetRNNDescriptor); CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif -- GitLab