From c331e2ce2031d68a553bc9469a07c30d718438f3 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 7 Nov 2022 22:32:20 +0800 Subject: [PATCH] Define ConvRunner to wrapper the call of cudnn conv functions. (#47576) * Define ConvRunner to wrapper the call of cudnn conv functions. * Use ConvKind in SearchAlgorithm. --- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 161 ++++++++++++- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 4 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 220 +++++++---------- paddle/phi/kernels/gpudnn/conv_kernel.cu | 36 ++- .../gpudnn/conv_transpose_grad_kernel.cu | 221 ++++++++---------- .../kernels/gpudnn/conv_transpose_kernel.cu | 38 ++- 6 files changed, 367 insertions(+), 313 deletions(-) diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 60d5854340f..12afa223f16 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -115,7 +115,7 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -template +template struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching @@ -123,9 +123,10 @@ struct SearchAlgorithmBase {}; // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvForward; @@ -296,9 +297,10 @@ struct SearchAlgorithmBase { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvBackwardData; @@ -478,9 +480,10 @@ struct SearchAlgorithmBase { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithmBase { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = phi::autotune::AlgorithmType::kConvBackwardFilter; @@ -684,9 +687,9 @@ struct SearchAlgorithmBase { } }; -template -struct SearchAlgorithm : public SearchAlgorithmBase { - using AlgoT = typename SearchAlgorithmBase::AlgoT; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; template static SearchResult Find(const phi::GPUContext& ctx, @@ -700,7 +703,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { SetConvMathType(ctx, dtype, args.cdesc); if (deterministic) { - result = SearchAlgorithmBase::FindAlgoDeterministic(args); + result = SearchAlgorithmBase::FindAlgoDeterministic(args); } else { // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. // 2. Once turning on auto-tune, run heuristic (default) before @@ -710,7 +713,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { // default mode for the rest. 
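For orientation, here is a minimal, self-contained sketch of how the search templates above are keyed once the specializations dispatch on ConvKind. Every cuDNN-specific member is elided, and the FwdAlgo/BwdDataAlgo/BwdFilterAlgo aliases are placeholders for the real cudnnConvolution*Algo_t enums, so the actual declarations in conv_cudnn_v7.h are richer than this outline.

// Minimal sketch of the ConvKind-keyed search dispatch; placeholder types
// stand in for the real cudnnConvolution*Algo_t enums.
enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };

// Placeholder algorithm types (the real code uses cudnnConvolutionFwdAlgo_t,
// cudnnConvolutionBwdDataAlgo_t and cudnnConvolutionBwdFilterAlgo_t).
using FwdAlgo = int;
using BwdDataAlgo = int;
using BwdFilterAlgo = int;

// Primary template stays empty; each ConvKind supplies a specialization that
// carries its AlgoT plus the FindAlgoDeterministic/Heuristic/ExhaustiveSearch
// strategies and GetWorkspaceSize.
template <ConvKind CK>
struct SearchAlgorithmBase {};

template <>
struct SearchAlgorithmBase<ConvKind::kForward> {
  using AlgoT = FwdAlgo;
};

template <>
struct SearchAlgorithmBase<ConvKind::kBackwardData> {
  using AlgoT = BwdDataAlgo;
};

template <>
struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
  using AlgoT = BwdFilterAlgo;
};

// SearchAlgorithm layers the shared cache/autotune policy (the Find() logic
// above) on top of whichever per-kind base it inherits.
template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
  using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;
};

int main() {
  // Call sites now name the searcher by ConvKind, e.g. for backward-data:
  using search_bwd_data = SearchAlgorithm<ConvKind::kBackwardData>;
  static_assert(sizeof(search_bwd_data) >= 1, "sketch only");
  return 0;
}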
auto key = args.ConvertToConvCacheKey(); auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( - SearchAlgorithmBase::kAlgoType); + SearchAlgorithmBase::kAlgoType); bool find_in_cache = cache.Find(key); if (find_in_cache) { auto t = cache.Get(key); @@ -727,7 +730,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { // Once autotune is enabled, the autotuned result can rewrite the // previous result in cache found by heuristic method. result = - SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( args, ctx); cache.Set(key, phi::autotune::ConvAutoTuneResult( @@ -735,7 +738,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { result.workspace_size, true)); } else if (!find_in_cache) { - result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); cache.Set(key, phi::autotune::ConvAutoTuneResult( static_cast(result.algo), @@ -744,7 +747,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } } } - VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() + VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() << "] exhaustive_search=" << exhaustive_search << ", use_autotune=" << use_autotune << ", deterministic=" << deterministic @@ -783,4 +786,138 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } }; +template +struct ConvRunner {}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* input_ptr, + const T* filter_ptr, + T* output_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionForward( + cudnn_handle, + &alpha, + args.idesc.desc(), + input_ptr + i * group_offset_in, + args.wdesc.desc(), + filter_ptr + i * group_offset_filter, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.odesc.desc(), + output_ptr + i * group_offset_out)); + }, + workspace_size); + } + } +}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* output_grad_ptr, + const T* filter_ptr, + T* input_grad_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 
1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnConvolutionBackwardData( + cudnn_handle, + &alpha, + args.wdesc.desc(), + filter_ptr + i * group_offset_filter, + args.odesc.desc(), + output_grad_ptr + i * group_offset_out, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.idesc.desc(), + input_grad_ptr + i * group_offset_in)); + }, + workspace_size); + } + } +}; + +template +struct ConvRunner { + static void Apply( + const phi::GPUContext& ctx, + const ConvArgs& args, + const SearchResult& search_result, + const T* output_grad_ptr, + const T* input_ptr, + T* filter_grad_ptr, + int groups, + int group_offset_in, + int group_offset_filter, + int group_offset_out, + size_t workspace_size, + phi::DnnWorkspaceHandle* workspace_handle, + bool use_addto = false) { + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + + auto cudnn_handle = ctx.cudnn_handle(); + for (int i = 0; i < groups; i++) { + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnConvolutionBackwardFilter( + cudnn_handle, + &alpha, + args.idesc.desc(), + input_ptr + i * group_offset_in, + args.odesc.desc(), + output_grad_ptr + i * group_offset_out, + args.cdesc.desc(), + search_result.algo, + workspace_ptr, + workspace_size, + &beta, + args.wdesc.desc(), + filter_grad_ptr + i * group_offset_filter)); + }, + workspace_size); + } + } +}; + } // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 537fee8a682..8f3604ed42e 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -34,7 +34,9 @@ template using ScalingParamType = typename paddle::platform::CudnnDataType::ScalingParamType; -// As the container of searchAlgorithm::Find() result. +enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 }; + +// The container of SearchAlgorithm::Find() result. 
template struct SearchResult { SearchResult() {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index a6934596332..6d7c486a75f 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -376,7 +376,7 @@ void ConvCudnnGradKernel(const Context& ctx, bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size = std::max(workspace_size, bwd_result.workspace_size); #endif @@ -401,7 +401,7 @@ void ConvCudnnGradKernel(const Context& ctx, filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " @@ -481,30 +481,22 @@ void ConvCudnnGradKernel(const Context& ctx, }, workspace_size); } - #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args1.wdesc.desc(), - filter_data + i * group_offset_filter, - args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), - bwd_result.algo, - cudnn_workspace_ptr, - workspace_size, - &beta, - args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args1, + bwd_result, + output_grad_data, + filter_data, + transformed_input_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + use_addto); #endif + if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -536,8 +528,6 @@ void ConvCudnnGradKernel(const Context& ctx, } } - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { // Because beta is zero, it is unnecessary to reset filter_grad. 
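To make call sites such as the ConvRunner::Apply replacement above easier to follow, the sketch below outlines the wrapper's contract for the forward kind: one static Apply() that loops over groups inside the workspace callback and derives beta from use_addto. Ctx, Args, Result and WorkspaceHandle are simplified placeholders for phi::GPUContext, ConvArgs, SearchResult and DnnWorkspaceHandle, and the cuDNN call itself is stubbed with a print, so this mirrors the structure rather than the exact signatures in conv_cudnn_v7.h.

#include <cstddef>
#include <functional>
#include <iostream>

// Simplified stand-ins for phi::GPUContext, ConvArgs, SearchResult and the
// DNN workspace handle; only what the sketch needs.
struct Ctx {};
struct Args {};
template <typename AlgoT> struct Result { AlgoT algo{}; };
struct WorkspaceHandle {
  void RunFunc(const std::function<void(void*)>& fn, size_t /*bytes*/) {
    fn(nullptr);  // the real handle allocates GPU workspace before calling fn
  }
};

enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 };

template <typename T, ConvKind CK>
struct ConvRunner {};

// Forward specialization; the other two kinds differ only in which backend
// call is issued and which tensor plays the "output" role.
template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  static void Apply(const Ctx& /*ctx*/,
                    const Args& /*args*/,
                    const Result<int>& search_result,
                    const T* input,
                    const T* filter,
                    T* output,
                    int groups,
                    int group_offset_in,
                    int group_offset_filter,
                    int group_offset_out,
                    size_t workspace_size,
                    WorkspaceHandle* workspace_handle,
                    bool use_addto = false) {
    const T alpha = static_cast<T>(1);
    // use_addto keeps the existing output and accumulates into it.
    const T beta = use_addto ? static_cast<T>(1) : static_cast<T>(0);
    for (int i = 0; i < groups; ++i) {
      workspace_handle->RunFunc(
          [&](void* workspace_ptr) {
            // Stub for cudnnConvolutionForward(...): report what would be run.
            std::cout << "group " << i << " algo=" << search_result.algo
                      << " alpha=" << alpha << " beta=" << beta
                      << " in+=" << i * group_offset_in
                      << " w+=" << i * group_offset_filter
                      << " out+=" << i * group_offset_out
                      << " ws=" << workspace_size
                      << (workspace_ptr ? "" : " (no workspace)") << "\n";
            (void)input; (void)filter; (void)output;
          },
          workspace_size);
    }
  }
};

int main() {
  WorkspaceHandle handle;
  float in[8] = {}, w[8] = {}, out[8] = {};
  ConvRunner<float, ConvKind::kForward>::Apply(
      Ctx{}, Args{}, Result<int>{}, in, w, out,
      /*groups=*/2, /*group_offset_in=*/4, /*group_offset_filter=*/4,
      /*group_offset_out=*/4, /*workspace_size=*/0, &handle,
      /*use_addto=*/false);
  return 0;
}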
@@ -562,27 +552,19 @@ void ConvCudnnGradKernel(const Context& ctx, }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args2.idesc.desc(), - input_data + i * group_offset_in, - args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), - filter_result.algo, - cudnn_workspace_ptr, - workspace_size, - &beta_filter, - args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args2, + filter_result, + output_grad_data, + input_data, + filter_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (compute_format == paddle::platform::DataLayout::kNHWC) { @@ -952,7 +934,7 @@ void ConvCudnnGradGradKernel( fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif @@ -977,7 +959,7 @@ void ConvCudnnGradGradKernel( fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); @@ -1003,7 +985,7 @@ void ConvCudnnGradGradKernel( filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else - using search3 = SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( @@ -1030,7 +1012,7 @@ void ConvCudnnGradGradKernel( data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else - using search4 = SearchAlgorithm; + using search4 = SearchAlgorithm; data_result = search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( @@ -1088,27 +1070,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx + i * group_offset_in, - args1.wdesc.desc(), - w + i * group_offset_filter, - args1.cdesc.desc(), - fwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args1, + fwd_result1, + ddx, + w, + transformed_ddy_channel, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif } if (ddW) { @@ -1134,27 +1108,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x + i * group_offset_in, - args2.wdesc.desc(), - ddw + i * group_offset_filter, - args2.cdesc.desc(), - fwd_result2.algo, - workspace_ptr, - 
workspace_size, - &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args2, + fwd_result2, + x, + ddw, + transformed_ddy_channel, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + true); #endif } if (channel_last) { @@ -1185,27 +1151,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - ddx + i * group_offset_in, - args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args3, + filter_result, + transformed_dy_channel, + ddx, + dw, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif } @@ -1232,27 +1190,19 @@ void ConvCudnnGradGradKernel( }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args4.wdesc.desc(), - ddw + i * group_offset_filter, - args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), - data_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args4, + data_result, + transformed_dy_channel, + ddw, + transformed_dx, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (!is_sys_pad) { diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index a393cc278f2..ba4cc129907 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -315,7 +315,7 @@ void ConvCudnnKernel(const Context& ctx, args, exhaustive_search, deterministic, workspace_size, ctx); #else SearchResult fwd_result; - using search = SearchAlgorithm; + using search = SearchAlgorithm; fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); workspace_size = fwd_result.workspace_size; #endif @@ -359,27 +359,19 @@ void ConvCudnnKernel(const Context& ctx, }, workspace_size); #else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args.idesc.desc(), - input_data + i * group_offset_in, - args.wdesc.desc(), - filter_data + i * group_offset_filter, - args.cdesc.desc(), - fwd_result.algo, - workspace_ptr, - workspace_size, - &beta, - args.odesc.desc(), - output_data + i * group_offset_out)); - }, - workspace_size); - } + ConvRunner::Apply(ctx, + args, + fwd_result, + input_data, + filter_data, + output_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); #endif if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu 
b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index f2c3cd0cc69..78961c86b07 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -227,7 +227,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); @@ -252,7 +252,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); @@ -269,9 +269,9 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); if (dx) { +#ifdef PADDLE_WITH_HIP // Because beta is zero, it is unnecessary to reset dx. for (int g = 0; g < groups; g++) { -#ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenConvolutionForward(handle, @@ -288,26 +288,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } #else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cudnnConvolutionForward(handle, - &alpha, - args1.idesc.desc(), - dout_data + dout_offset * g, - args1.wdesc.desc(), - filter_data + filter_offset * g, - args1.cdesc.desc(), - fwd_result.algo, - cudnn_workspace, + ConvRunner::Apply(ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, workspace_size, - &beta, - args1.odesc.desc(), - dx_data + x_offset * g)); - }; + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - workspace_handle.RunFunc(cudnn_func, workspace_size); - } if (data_layout == GPUDNNDataLayout::kNHWC) { DenseTensor dx_transpose; @@ -330,8 +327,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, if (dfilter) { // Because beta is zero, it is unnecessary to reset dfilter. 
// Gradient with respect to the filter - for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( handle, @@ -348,26 +345,23 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args2.idesc.desc(), - dout_data + dout_offset * g, - args2.odesc.desc(), - x_data + x_offset * g, - args2.cdesc.desc(), - filter_result.algo, - cudnn_workspace, - workspace_size, - &beta, - args2.wdesc.desc(), - dfilter_data + filter_offset * g)); - }; -#endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP } } @@ -704,7 +698,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( bwd_result1.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else - using search1 = SearchAlgorithm; + using search1 = SearchAlgorithm; bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -726,7 +720,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else - using search2 = SearchAlgorithm; + using search2 = SearchAlgorithm; bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); @@ -751,7 +745,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else - using search3 = SearchAlgorithm; + using search3 = SearchAlgorithm; filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); @@ -777,7 +771,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else - using search4 = SearchAlgorithm; + using search4 = SearchAlgorithm; fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); @@ -815,8 +809,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( if (ddout) { ddx_ = transformed_ddx.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( @@ -835,30 +829,25 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args1.wdesc.desc(), - filter_ + i * group_offset_filter, - args1.odesc.desc(), - ddx_ + i * group_offset_in, - args1.cdesc.desc(), - bwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.idesc.desc(), - transformed_ddout_channel_ + i * group_offset_out)); - 
}, - workspace_size); + ConvRunner::Apply(ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { // MIOPEN ONLY support beta to be 0.0f DenseTensor conv_x_ddfilter(dout.type()); conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); @@ -893,27 +882,22 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( &beta, args2.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out)); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args2.wdesc.desc(), - ddfilter_ + i * group_offset_filter, - args2.odesc.desc(), - x_ + i * group_offset_in, - args2.cdesc.desc(), - bwd_result2.algo, - workspace_ptr, - workspace_size, - &alpha, - args2.idesc.desc(), - transformed_ddout_channel_ + i * group_offset_out)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); #endif // PADDLE_WITH_HIP - } if ((!is_sys_pad) && (!channel_last)) { if (strides.size() == 2U) { @@ -947,8 +931,8 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( T* transformed_dout_channel_ = transformed_dout.data(); if (dfilter) { ddx_ = transformed_ddx_channel.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -968,33 +952,28 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - transformed_dout_channel_ + i * group_offset_out, - args3.odesc.desc(), - ddx_ + i * group_offset_in, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dfilter_ + i * group_offset_filter)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } } if (dx) { ddfilter_ = ddfilter.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( @@ -1013,27 +992,23 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( workspace_size)); }, workspace_size); + } #else // PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward( - handle, - &alpha, - args4.idesc.desc(), - transformed_dout_channel_ + i * group_offset_out, - args4.wdesc.desc(), - ddfilter_ + i * group_offset_filter, - args4.cdesc.desc(), - fwd_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.odesc.desc(), - transformed_dx_ + i * group_offset_in)); - }, - workspace_size); + ConvRunner::Apply(ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + 
transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); #endif // PADDLE_WITH_HIP - } + if (channel_last) { TransToChannelLast(ctx, &transformed_dx_channel, dx); } diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 13e668f7602..593114ac65f 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -227,7 +227,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, search::Find(args, false, deterministic, workspace_size, ctx); #else SearchResult bwd_result; - using search = SearchAlgorithm; + using search = SearchAlgorithm; bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); @@ -240,8 +240,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ScalingParamType alpha = 1.0f; ScalingParamType beta = 0.0f; auto workspace_handle = ctx.cudnn_workspace_handle(); - for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( handle, @@ -258,26 +258,24 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, cudnn_workspace, workspace_size)); }; -#else // PADDLE_WITH_HIP - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args.wdesc.desc(), - filter_data + filter_offset * g, - args.odesc.desc(), - x_data + x_offset * g, - args.cdesc.desc(), - bwd_result.algo, - cudnn_workspace, - workspace_size, - &beta, - args.idesc.desc(), - transformed_out_data + out_offset * g)); - }; -#endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(ctx, + args, + bwd_result, + x_data, + filter_data, + transformed_out_data, + groups, + out_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + if (!is_sys_pad && strides.size() == 2U) { funcs::Slice(ctx, &transformed_out, out, starts, ends, axes); } else if (!is_sys_pad && strides.size() == 3U) { -- GitLab
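As a closing illustration of the policy that SearchAlgorithm::Find() layers on top of the per-kind searchers, the self-contained sketch below reproduces the decision order described in the header comments: deterministic mode bypasses the cache, the exhaustive flag or an active autotune pass forces exhaustive search and upgrades any heuristic entry, and everything else is served from the cache or filled by the heuristic. The Cache, AlgoResult and Find* helpers are simplified placeholders for phi::autotune's types, so treat this as an outline of the control flow, not the real API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Simplified stand-ins for phi::autotune's cache entry and conv cache key.
struct AlgoResult {
  int algo;
  size_t workspace_size;
  bool exhaustive_search;  // true once the entry came from exhaustive tuning
};
using Cache = std::unordered_map<uint64_t, AlgoResult>;

// Placeholder strategies; the real ones live in SearchAlgorithmBase<CK>.
AlgoResult FindDeterministic() { return AlgoResult{1, 0, false}; }
AlgoResult FindHeuristic() { return AlgoResult{2, 1u << 20, false}; }
AlgoResult FindExhaustive() { return AlgoResult{3, 4u << 20, true}; }

AlgoResult Find(uint64_t key,
                bool exhaustive_flag,
                bool use_autotune,
                bool deterministic,
                Cache* cache) {
  if (deterministic) {
    return FindDeterministic();  // ignores the cache and tuning entirely
  }
  auto it = cache->find(key);
  const bool found = (it != cache->end());
  // Exhaustive search runs when requested and the cached entry (if any)
  // was only produced by the heuristic; its result overwrites the cache.
  const bool need_exhaustive = (exhaustive_flag || use_autotune) &&
                               (!found || !it->second.exhaustive_search);
  if (need_exhaustive) {
    AlgoResult r = FindExhaustive();
    (*cache)[key] = r;
    return r;
  }
  if (!found) {
    AlgoResult r = FindHeuristic();  // default mode, cached without upgrade
    (*cache)[key] = r;
    return r;
  }
  return it->second;  // cache hit (heuristic or previously tuned)
}

int main() {
  Cache cache;
  // First call, autotune off: the heuristic result is cached.
  AlgoResult a = Find(/*key=*/42, false, false, false, &cache);
  // Later call during an autotune pass: the cached entry is upgraded once.
  AlgoResult b = Find(/*key=*/42, false, true, false, &cache);
  std::cout << "first algo=" << a.algo << ", tuned algo=" << b.algo << "\n";
  return 0;
}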