diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 98466d44fc0dd91ef0cc8e8eac2660c42a19267c..b68e1b4070c881aaa6770560bd81836403ed0da1 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -60,9 +60,8 @@ else() set(CUDNN_FOUND OFF) endif() -if(CUDNN_FOUND) - file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) - +macro(find_cudnn_version cudnn_header_file) + file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS) get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" @@ -93,10 +92,15 @@ if(CUDNN_FOUND) math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") + message(STATUS "Current cuDNN header is ${cudnn_header_file} " + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") endif() - - message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") - endif() +endmacro() + +if(CUDNN_FOUND) + find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h) + if (NOT CUDNN_MAJOR_VERSION) + find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h) + endif() endif() diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index fadffaee71d21e736bcc0ab696e44277a85345eb..25b45f281a799ade12ec9cbfb8fb262dbc572196 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -162,19 +162,7 @@ struct SearchAlgorithm { workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { - has_got_workspace_size = false; - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - } - if (!has_got_workspace_size) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.wdesc.desc(), - args.cdesc.desc(), args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit = workspace_size; } #else PADDLE_ENFORCE_CUDA_SUCCESS( @@ -303,19 +291,8 @@ struct SearchAlgorithm { #endif workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { + workspace_size_limit = workspace_size; has_got_workspace_size = false; - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - } - if (!has_got_workspace_size) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, args.wdesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); } #else PADDLE_ENFORCE_CUDA_SUCCESS( @@ -432,19 +409,7 @@ struct SearchAlgorithm { algo = (perf_results.get())[best_algo_idx].algo; workspace_size = GetWorkspaceSize(args, algo); if (workspace_size > workspace_size_limit) { - has_got_workspace_size = false; - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - } - if (!has_got_workspace_size) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, args.idesc.desc(), args.odesc.desc(), - args.cdesc.desc(), args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size = workspace_size_limit; } #else PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 1b8360a3092f5640a4de83a038dee60bdfe8b83e..e8f371cb4877f343d108e8528345be03cd9b354b 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(transformed_input.dims()); auto f_dims = framework::vectorize(filter->dims()); if (!exhaustive_search) { + int perf_count; + int best_algo_idx = 0; + size_t tmp_size = 0; + std::unique_ptr perf_results( + new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( + platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, + perf_results.get())); + algo = (perf_results.get())[best_algo_idx].algo; VLOG(3) << "cuDNN forward algo " << algo; } else { std::function search_func = diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 9d6b9665f85ab528767ae8f65356d7f3901ad274..3529ff1f94aab259661640925f5096890dd95566 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_strides[i].data())); + + int perf_count; + int best_algo_idx = 0; + size_t tmp_size = 0; + std::unique_ptr perf_results( + new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( + platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo[i])); - size_t tmp_size = 0; + kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); + algo[i] = (perf_results.get())[best_algo_idx].algo; + PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], algo[i], &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } cudnnActivationDescriptor_t cudnn_act_desc = diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 96297ec8557bcaac08b105d362dbca2fb0dcd29b..0eb28f0c0c3561f98891ff2a0ab5a26a20b07fb4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnSetTensorNdDescriptor); \ __macro(cudnnGetTensorNdDescriptor); \ __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnCreateTensorDescriptor); \ __macro(cudnnDestroyTensorDescriptor); \ __macro(cudnnCreateFilterDescriptor); \ @@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnDropoutGetStatesSize); \ __macro(cudnnSetDropoutDescriptor); \ __macro(cudnnCreateRNNDescriptor); \ - __macro(cudnnSetRNNDescriptor); \ __macro(cudnnGetRNNParamsSize); \ __macro(cudnnGetRNNWorkspaceSize); \ __macro(cudnnGetRNNTrainingReserveSize); \ @@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #if CUDNN_VERSION >= 3000 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +// APIs available after R3: +#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnSetRNNDescriptor); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + // APIs available after R4: #if CUDNN_VERSION >= 4007 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ @@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif + +#if CUDNN_VERSION >= 8000 +#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8); +CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle