[CUDNN8 support] : support CUDNN8 (#25664)

* cunn8 support test=develop * fix ci error test=develop

[CUDNN8 support] : support CUDNN8 (#25664)
* cunn8 support test=develop * fix ci error test=develop
358bc06c · Zhaolong Xing · GitHub · 5970871a · 358bc06c · 358bc06c
5 changed file
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -60,9 +60,8 @@ else()
    set(CUDNN_FOUND OFF)
 endif()

-if(CUDNN_FOUND)
-    file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
-
+macro(find_cudnn_version cudnn_header_file) 
+    file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS)
    get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)

    string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
            math(EXPR CUDNN_VERSION
                "${CUDNN_MAJOR_VERSION} * 1000 +
                 ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
+            message(STATUS "Current cuDNN header is ${cudnn_header_file} "
+              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
        endif()
-
-        message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. "
-            "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
-
    endif()
+endmacro()
+
+if(CUDNN_FOUND)
+  find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h) 
+  if (NOT CUDNN_MAJOR_VERSION) 
+    find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h) 
+  endif()
 endif()
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
      workspace_size = GetWorkspaceSize(args, algo);

      if (workspace_size > workspace_size_limit) {
-        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-                args.handle, args.idesc.desc(), args.wdesc.desc(),
-                args.cdesc.desc(), args.odesc.desc(),
-                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &algo));
+        workspace_size_limit = workspace_size;
      }
 #else
      PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #endif
      workspace_size = GetWorkspaceSize(args, algo);
      if (workspace_size > workspace_size_limit) {
+        workspace_size_limit = workspace_size;
        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-                args.handle, args.wdesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.idesc.desc(),
-                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &algo));
      }
 #else
      PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
      algo = (perf_results.get())[best_algo_idx].algo;
      workspace_size = GetWorkspaceSize(args, algo);
      if (workspace_size > workspace_size_limit) {
-        has_got_workspace_size = false;
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-      }
-      if (!has_got_workspace_size) {
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-                args.handle, args.idesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.wdesc.desc(),
-                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &algo));
+        workspace_size = workspace_size_limit;
      }
 #else
      PADDLE_ENFORCE_CUDA_SUCCESS(

--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    auto x_dims = framework::vectorize(transformed_input.dims());
    auto f_dims = framework::vectorize(filter->dims());
    if (!exhaustive_search) {
+      int perf_count;
+      int best_algo_idx = 0;
+      size_t tmp_size = 0;
+      std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
+          new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo));
+              cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
+              perf_results.get()));
+      algo = (perf_results.get())[best_algo_idx].algo;
      VLOG(3) << "cuDNN forward algo " << algo;
    } else {
      std::function<cudnnConvolutionFwdAlgo_t()> search_func =

--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
          out_strides[i].data()));
+
+      int perf_count;
+      int best_algo_idx = 0;
+      size_t tmp_size = 0;
+      std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
+          new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
      PADDLE_ENFORCE_CUDA_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &algo[i]));
-      size_t tmp_size = 0;
+              kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get()));
+      algo[i] = (perf_results.get())[best_algo_idx].algo;
+
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
              algo[i], &tmp_size));
+
      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
    }
    cudnnActivationDescriptor_t cudnn_act_desc =

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  __macro(cudnnSetTensorNdDescriptor);                    \
  __macro(cudnnGetTensorNdDescriptor);                    \
  __macro(cudnnGetConvolutionNdForwardOutputDim);         \
-  __macro(cudnnGetConvolutionForwardAlgorithm);           \
  __macro(cudnnCreateTensorDescriptor);                   \
  __macro(cudnnDestroyTensorDescriptor);                  \
  __macro(cudnnCreateFilterDescriptor);                   \
@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  __macro(cudnnDropoutGetStatesSize);                     \
  __macro(cudnnSetDropoutDescriptor);                     \
  __macro(cudnnCreateRNNDescriptor);                      \
-  __macro(cudnnSetRNNDescriptor);                         \
  __macro(cudnnGetRNNParamsSize);                         \
  __macro(cudnnGetRNNWorkspaceSize);                      \
  __macro(cudnnGetRNNTrainingReserveSize);                \
@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #if CUDNN_VERSION >= 3000
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif

+// APIs available after R3:
+#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);   \
+  __macro(cudnnGetConvolutionForwardAlgorithm);          \
+  __macro(cudnnSetRNNDescriptor);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 // APIs available after R4:
 #if CUDNN_VERSION >= 4007
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
  __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
+
+#if CUDNN_VERSION >= 8000
+#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8);
+CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle