From f4ec1563be117df485d451f87b6eb5a0912f69d1 Mon Sep 17 00:00:00 2001
From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Date: Mon, 19 Sep 2022 12:39:57 +0800
Subject: [PATCH] convfusion_cache (#46054)

---
 paddle/fluid/framework/conv_search_cache.h    |  6 +-
 .../fluid/framework/operator_kernel_configs.h | 10 ++
 .../fluid/operators/fused/conv_fusion_op.cu   | 96 ++++++++++---------
 3 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h
index 4da2aeb4d04..1620c99ce85 100644
--- a/paddle/fluid/framework/conv_search_cache.h
+++ b/paddle/fluid/framework/conv_search_cache.h
@@ -55,7 +55,8 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* GetBackwardFilter() {
     return &backward_filter_cache_;
   }
-  AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
+  AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>*
+  GetConvFusion() {
     return &fusion_forward_cache_;
   }
 #endif
@@ -75,7 +76,8 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
   AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
-  AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+  AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>
+      fusion_forward_cache_;
 #endif
 };
 
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
index dc798868dc6..6a6419042f1 100644
--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -24,6 +24,16 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename AlgoT>
+struct SearchFuseResult {
+  SearchFuseResult() {}
+  explicit SearchFuseResult(AlgoT a) : algo(a) {}
+
+  AlgoT algo = static_cast<AlgoT>(0);
+  float time = -1.f;
+  size_t workspace_size = 0;
+};
+
 // thread-safe.
 template <typename TAlgorithm>
 class AlgorithmsCache {
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index 4f05e6b6e2f..6f0ebc2c7eb 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -35,6 +35,7 @@ using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
 using DataLayout = platform::DataLayout;
 using framework::AlgorithmsCache;
 using framework::ConvSearchCache;
+using framework::SearchFuseResult;
 
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
@@ -348,34 +349,35 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               &perf_count,
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
               handle,
               cudnn_input_desc,
               cudnn_filter_desc,
               cudnn_conv_desc,
               cudnn_output_desc,
-              algo,
-              &workspace_size_in_bytes));
-      if (workspace_size_in_bytes > workspace_size_limit)
-        workspace_size_limit = workspace_size_in_bytes;
-#else
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit,
+              &algo));
+#endif
       PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
               handle,
               cudnn_input_desc,
               cudnn_filter_desc,
               cudnn_conv_desc,
               cudnn_output_desc,
-              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit,
-              &algo));
+              algo,
+              &workspace_size_in_bytes));
+      if (workspace_size_in_bytes > workspace_size_limit)
+        workspace_size_limit = workspace_size_in_bytes;
       VLOG(3) << "cuDNN forward algo " << algo;
-#endif
     } else {
-      std::function<cudnnConvolutionFwdAlgo_t()> search_func =
-          [&]() -> cudnnConvolutionFwdAlgo_t {
+      std::function<SearchFuseResult<cudnnConvolutionFwdAlgo_t>()> search_func =
+          [&]() -> SearchFuseResult<cudnnConvolutionFwdAlgo_t> {
         int returned_algo_count;
+        SearchFuseResult<cudnnConvolutionFwdAlgo_t> fwd_result;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
         auto cudnn_find_func = [&](void* cudnn_workspace) {
@@ -402,11 +404,34 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
                   << " " << stat.memory;
         }
-        return fwd_perf_stat[0].algo;
+
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+                handle,
+                cudnn_input_desc,
+                cudnn_filter_desc,
+                cudnn_conv_desc,
+                cudnn_output_desc,
+                fwd_perf_stat[0].algo,
+                &workspace_size_in_bytes));
+        // PADDLE_ENFORCE_LE(
+        //     workspace_size_in_bytes,
+        //     workspace_size_limit,
+        //     platform::errors::InvalidArgument(
+        //         "The actual workspace size to be allocated for cuDNN is
+        //         expected " "to be less than the limit. But received: the
+        //         actual workspace " "size = %d, limit = %d.",
+        //         workspace_size_in_bytes,
+        //         workspace_size_limit));
+
+        fwd_result.algo = fwd_perf_stat[0].algo;
+        fwd_result.workspace_size = workspace_size_in_bytes;
+        return fwd_result;
       };
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
+      AlgorithmsCache<SearchFuseResult<cudnnConvolutionFwdAlgo_t>>& algo_cache =
           *(framework::ConvSearchCache::Instance().GetConvFusion());
       int search_times = ctx.Attr<int>("search_times");
+      SearchFuseResult<cudnnConvolutionFwdAlgo_t> algo_result;
       search_times = std::max(
           static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
       // TODO(dangqingqing): Unify this if-else.
@@ -414,39 +439,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
         // The searched algo will be cached by `search_times` times for
         // different input dimension. For other dimensions, select the algo
         // of closest area.
-        algo = algo_cache.GetAlgorithm(
+        algo_result = algo_cache.GetAlgorithm(
             x_dims[2] * x_dims[3], search_times, 0, search_func);
+        algo = algo_result.algo;
+        workspace_size_in_bytes = algo_result.workspace_size;
       } else {
-        algo = algo_cache.GetAlgorithm(x_dims,
-                                       f_dims,
-                                       strides,
-                                       paddings,
-                                       dilations,
-                                       0,
-                                       dtype,
-                                       search_func);
+        algo_result = algo_cache.GetAlgorithm(x_dims,
+                                              f_dims,
+                                              strides,
+                                              paddings,
+                                              dilations,
+                                              0,
+                                              dtype,
+                                              search_func);
+        algo = algo_result.algo;
+        workspace_size_in_bytes = algo_result.workspace_size;
       }
       VLOG(3) << "choose algo " << algo;
     }
-
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-            handle,
-            cudnn_input_desc,
-            cudnn_filter_desc,
-            cudnn_conv_desc,
-            cudnn_output_desc,
-            algo,
-            &workspace_size_in_bytes));
-    // PADDLE_ENFORCE_LE(
-    //     workspace_size_in_bytes,
-    //     workspace_size_limit,
-    //     platform::errors::InvalidArgument(
-    //         "The actual workspace size to be allocated for cuDNN is expected
-    //         " "to be less than the limit. But received: the actual workspace
-    //         " "size = %d, limit = %d.", workspace_size_in_bytes,
-    //         workspace_size_limit));
-
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-- 
GitLab
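
For reference, below is a minimal standalone sketch (not part of the patch) of the caching pattern the change introduces: the exhaustive-search result is stored as a single SearchFuseResult value holding both the chosen algorithm and its workspace size, so a cache hit returns both without re-querying cuDNN. FusionAlgoCache, FwdAlgo, and the hard-coded numbers are simplified stand-ins for illustration only, not Paddle's real AlgorithmsCache or the cuDNN types.

// Standalone illustration of caching an algorithm together with its
// workspace size, keyed by input area, as SearchFuseResult does in the patch.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

template <typename AlgoT>
struct SearchFuseResult {            // mirrors the struct added by the patch
  AlgoT algo = static_cast<AlgoT>(0);
  float time = -1.f;
  size_t workspace_size = 0;
};

using FwdAlgo = int;                 // stand-in for cudnnConvolutionFwdAlgo_t

class FusionAlgoCache {              // simplified stand-in for AlgorithmsCache
 public:
  SearchFuseResult<FwdAlgo> GetAlgorithm(
      int64_t area, std::function<SearchFuseResult<FwdAlgo>()> search_func) {
    auto it = map_.find(area);
    if (it != map_.end()) return it->second;  // hit: algo AND workspace size
    SearchFuseResult<FwdAlgo> result = search_func();  // miss: search once
    map_[area] = result;
    return result;
  }

 private:
  std::unordered_map<int64_t, SearchFuseResult<FwdAlgo>> map_;
};

int main() {
  FusionAlgoCache cache;
  auto search_func = []() -> SearchFuseResult<FwdAlgo> {
    // The real kernel calls cudnnFindConvolutionForwardAlgorithmEx and
    // cudnnGetConvolutionForwardWorkspaceSize here; values are made up.
    SearchFuseResult<FwdAlgo> r;
    r.algo = 1;
    r.workspace_size = 4096;
    return r;
  };
  for (int i = 0; i < 2; ++i) {      // the second call is served from the cache
    auto result = cache.GetAlgorithm(/*area=*/224 * 224, search_func);
    std::printf("algo=%d workspace=%zu\n", result.algo, result.workspace_size);
  }
  return 0;
}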