Commit 76b77d81 (unverified), authored by Tian Zheng, committed by GitHub

Update CUDNN Frontend API to v0.9.1 (#54949)

* Update CUDNN Frontend API to v0.9.1
- Remove old patches
- Remove workarounds that are no longer needed

* Fix test_switch_autotune
Parent f1bffdac
@@ -28,24 +28,24 @@ endif()
 if((NOT DEFINED CUDNN_FRONTEND_NAME) OR (NOT DEFINED CUDNN_FRONTEND_URL))
   set(CUDNN_FRONTEND_VER
-      "1.23.2"
+      "v0.9.1"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_NAME
       "cudnn-frontend"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_URL
-      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.7.1.tar.gz"
+      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/${CUDNN_FRONTEND_VER}.tar.gz"
       CACHE STRING "" FORCE)
-  set(CUDNN_FRONTEND_CACHE_FILENAME "v0.7.1.tar.gz")
 endif()
-set(CUDNN_FRONTEND_URL_MD5 "d8f911df571f8b0d40226efa9c0150c8")
+set(CUDNN_FRONTEND_CACHE_FILENAME "${CUDNN_FRONTEND_VER}.tar.gz")
+set(CUDNN_FRONTEND_URL_MD5 "da7cbad1305427f687dd4fd737178f80")
 message(
   STATUS
     "CUDNN_FRONTEND_NAME: ${CUDNN_FRONTEND_NAME}, CUDNN_FRONTEND_URL: ${CUDNN_FRONTEND_URL}"
 )
-set(DIRENT_DOWNLOAD_DIR "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
-# Version: v0.7.1
+set(CUDNN_FRONTEND_DOWNLOAD_DIR
+    "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
 set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend)
 set(CUDNN_FRONTEND_SOURCE_DIR
     ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include)
@@ -55,7 +55,7 @@ include_directories(${CUDNN_FRONTEND_INCLUDE_DIR})
 message(
   STATUS
-    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${DIRENT_DOWNLOAD_DIR}"
+    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${CUDNN_FRONTEND_DOWNLOAD_DIR}"
 )
 function(download_cudnn_frontend)
@@ -99,9 +99,7 @@ ExternalProject_Add(
   DOWNLOAD_DIR ${CUDNN_FRONTEND_DOWNLOAD_DIR}
   DOWNLOAD_NO_PROGRESS 1
   UPDATE_COMMAND ""
-  PATCH_COMMAND
-    patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 <
-    ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch
+  PATCH_COMMAND ""
   CONFIGURE_COMMAND ""
   BUILD_COMMAND ""
   INSTALL_COMMAND ""
......
@@ -17,6 +17,7 @@
 #include <map>
 #include <mutex>
 #include <string>
+#include <thread>
 #include <vector>
 #include "paddle/phi/backends/dynload/cudnn_frontend.h"
@@ -34,7 +35,13 @@ class CudnnFrontendPlanCache {
     saturation_count_ = FLAGS_cudnn_cache_saturation_count;
   }
-  int64_t Size() const { return map_.size(); }
+  int64_t Size() const {
+    int64_t total_size = 0;
+    for (auto it = map_.begin(); it != map_.end(); it++) {
+      total_size += (it->second).size();
+    }
+    return total_size;
+  }
   int64_t CacheHits() const { return cache_hits_; }
@@ -58,11 +65,12 @@ class CudnnFrontendPlanCache {
     cache_misses_ = 0;
   }
-  bool FindPlan(const cudnn_frontend::OperationGraph& op_graph,
-                bool use_addto = false) {
+  bool FindPlan(const cudnn_frontend::feature_vector_t &feature,
+                cudnnHandle_t handle) {
     bool ret = false;
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto)) > 0) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    if (local_map.count(GetExtendedFeature(feature, handle)) > 0) {
       cache_hits_++;
       ret = true;
     } else {
@@ -71,58 +79,98 @@ class CudnnFrontendPlanCache {
     return ret;
   }
-  cudnn_frontend::ManagedOpaqueDescriptor GetConfig(
-      const cudnn_frontend::OperationGraph& op_graph,
-      cudnnHandle_t handle,
-      bool use_addto = false) {
+  void GetPlan(const cudnn_frontend::feature_vector_t &feature,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    // Note(tizheng): CUDNNv8 execution plan is not thread-safe.
+    // A shared plan being executed by different threads is
+    // generally not safe (for now).
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    auto engine_config = map_[MakeKey(op_graph, use_addto)];
-    return engine_config;
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+
+    auto it = local_map.find(GetExtendedFeature(feature, handle));
+    if (it == local_map.end()) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "[cudnn_frontend] Cached Plan Not Found."));
+      return;
+    }
+    *plan = &(it->second);
+    *workspace_size = (*plan)->getWorkspaceSize();
+    VLOG(4) << "Cached execution plan found." << (*plan)->getTag()
+            << "; Require workspace: " << *workspace_size;
   }
-  void InsertPlan(const cudnn_frontend::OperationGraph& op_graph,
-                  const cudnn_frontend::ExecutionPlan& plan,
-                  bool use_addto = false) {
-    VLOG(4) << "[cudnn_frontend] cache: Insert graph tag: "
-            << op_graph.getTag();
+  void InsertPlan(const cudnn_frontend::feature_vector_t &feature,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    VLOG(4) << "[cudnn_frontend] cache: Insert plan: " << plan.getTag();
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    map_.insert(
-        std::make_pair(MakeKey(op_graph, use_addto), plan.GetEngineConfig()));
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    local_map.insert(std::make_pair(GetExtendedFeature(feature, handle), plan));
   }
-  bool IsStable(const cudnn_frontend::OperationGraph& op_graph,
-                const std::string& tag,
-                bool use_addto = false) {
+  bool IsStable(const cudnn_frontend::feature_vector_t &feature,
+                const std::string &tag,
+                cudnnHandle_t handle) {
     if (saturation_count_ == 1) {
       return true;
     }
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto))) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    auto &local_tracker = tracker_[hasher(std::this_thread::get_id())];
+    auto ext_feature = GetExtendedFeature(feature, handle);
+    if (local_map.count(ext_feature)) {
       return false;
     }
-    int cnt = tracker_[std::make_pair(MakeKey(op_graph, use_addto), tag)] += 1;
-    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << op_graph.getTag()
-            << " " << tag << " " << cnt;
+    int cnt = local_tracker[std::make_pair(ext_feature, tag)] += 1;
+    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << tag << " " << cnt;
     return cnt >= saturation_count_;
   }
+
+  bool FindPlan(const cudnn_frontend::OperationGraph &op_graph,
+                cudnnHandle_t handle) {
+    return FindPlan(op_graph.getFeatureVector(), handle);
+  }
+
+  void GetPlan(const cudnn_frontend::OperationGraph &op_graph,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    GetPlan(op_graph.getFeatureVector(), plan, workspace_size, handle);
+  }
+
+  void InsertPlan(const cudnn_frontend::OperationGraph &op_graph,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    InsertPlan(op_graph.getFeatureVector(), plan, handle);
+  }
+
+  bool IsStable(const cudnn_frontend::OperationGraph &op_graph,
+                const std::string &tag,
+                cudnnHandle_t handle) {
+    return IsStable(op_graph.getFeatureVector(), tag, handle);
+  }
+
  private:
-  static cudnn_frontend::feature_vector_t MakeKey(
-      const cudnn_frontend::OperationGraph& op_graph, bool use_addto) {
-    auto key = op_graph.getFeatureVector();
-    key.push_back(static_cast<uint64_t>(use_addto));
-    return key;
+  cudnn_frontend::feature_vector_t GetExtendedFeature(
+      cudnn_frontend::feature_vector_t feat, cudnnHandle_t handle) {
+    int64_t val = 0;
+    memcpy(&val, &handle, sizeof(int64_t));
+    feat.push_back(val);
+    return feat;
   }
-  std::map<cudnn_frontend::feature_vector_t,
-           cudnn_frontend::ManagedOpaqueDescriptor>
-      map_;
+  using FeatureVectorToPlanMap =
+      std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
+  std::map<std::size_t, FeatureVectorToPlanMap> map_;
+  std::hash<std::thread::id> hasher;
   std::shared_ptr<std::mutex> cache_mutex_;
   int saturation_count_;
   using SaturationTracker =
       std::map<std::pair<cudnn_frontend::feature_vector_t, std::string>, int>;
-  SaturationTracker tracker_;
+  std::map<std::size_t, SaturationTracker> tracker_;
   int64_t cache_hits_{0};
   int64_t cache_misses_{0};
......
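The new cache design above deserves a note: cuDNN v8 execution plans are not safe to execute concurrently, so the cache keeps one inner map per thread (keyed by the hash of `std::this_thread::get_id()`), and every lookup key appends the raw `cudnnHandle_t` pointer value to the operation graph's feature vector, so a plan built for one handle is never replayed on another. The saturation tracker is sharded the same way. A minimal self-contained sketch of this two-level keying (the names here are illustrative, not Paddle's):

```cpp
#include <cstdint>
#include <cstring>
#include <functional>
#include <map>
#include <thread>
#include <vector>

// Stand-in for cudnn_frontend::feature_vector_t (a vector of int64_t).
using FeatureVector = std::vector<int64_t>;

struct ShardedPlanCache {
  // One inner map per thread: plans are never shared across threads.
  std::map<std::size_t, std::map<FeatureVector, int /*plan stand-in*/>> shards;
  std::hash<std::thread::id> hasher;

  // Append the handle's pointer value to the key, as GetExtendedFeature does
  // above (assumes 64-bit pointers, matching the memcpy into int64_t in the diff).
  static FeatureVector ExtendWithHandle(FeatureVector key, void* handle) {
    int64_t val = 0;
    std::memcpy(&val, &handle, sizeof(val));
    key.push_back(val);
    return key;
  }

  bool Contains(const FeatureVector& feature, void* handle) {
    auto& local = shards[hasher(std::this_thread::get_id())];
    return local.count(ExtendWithHandle(feature, handle)) > 0;
  }
};
```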
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/autotune/cache.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
+#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

 namespace phi {

@@ -102,6 +103,33 @@ class CudnnFrontendConvHelper {
         .build();
   }
static inline cudnn_frontend::Tensor GetGeneralTensorDescriptor(
std::vector<int64_t> dims,
cudnnTensorFormat_t layout,
int64_t id,
int64_t alignment,
cudnnDataType_t dtype,
bool is_virtual = false,
int64_t group_count = 0) {
std::vector<int64_t> strides = GenerateStrides(dims, layout);
if (group_count > 0) {
int64_t c_per_group = dims[1];
int64_t c_stride = strides[1];
dims.insert(dims.begin() + 1, group_count);
strides.insert(strides.begin() + 1, c_stride * c_per_group);
}
cudnn_frontend::TensorBuilder builder;
builder.setDim(dims.size(), dims.data())
.setStride(strides.size(), strides.data())
.setId(id)
.setAlignment(alignment)
.setDataType(dtype);
if (is_virtual) {
builder.setVirtual();
}
return builder.build();
}
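The `group_count` branch above encodes grouped convolution in the tensor layout itself: the channel dimension (read here as channels-per-group) gains a leading group dimension whose stride spans one whole channel group. A tiny standalone sketch with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

// Mirrors the dims/strides edit in GetGeneralTensorDescriptor: insert a group
// dimension at position 1 whose stride covers c_per_group channels.
void InsertGroupDim(std::vector<int64_t>* dims,
                    std::vector<int64_t>* strides,
                    int64_t group_count) {
  if (group_count > 0) {
    int64_t c_per_group = (*dims)[1];  // dims[1] is channels per group here
    int64_t c_stride = (*strides)[1];  // stride of the channel dimension
    dims->insert(dims->begin() + 1, group_count);
    strides->insert(strides->begin() + 1, c_stride * c_per_group);
  }
}

// e.g. dims {N, C/g, H, W} = {8, 16, 32, 32} with group_count = 4 becomes
// {8, 4, 16, 32, 32}; the new group dimension strides over one 16-channel group.
```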
   static cudnn_frontend::ConvDesc_v8 GetConvDescriptor(
       cudnnDataType_t dataType,
       const std::vector<int>& padding,
@@ -157,44 +185,26 @@ class CudnnFrontendConvHelper {
       cudnn_frontend::OperationGraph* op_graph_pointer,
       bool exhaustive_search,
       bool deterministic,
-      void* x_data,
-      void* y_data,
-      void* w_data,
+      std::vector<void*>* data_ptrs,
+      std::vector<int64_t>* uids,
       cudnnHandle_t handle,
       phi::DnnWorkspaceHandle* workspace_handle) {
     auto heurgen_method = [=](cudnn_frontend::OperationGraph& op_graph_)
         -> cudnn_frontend::EngineConfigList {
-      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
-                            .setOperationGraph(op_graph_)
-                            .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
-                            .build();
-      VLOG(4) << "Heuristic has " << heuristics.getEngineConfigCount()
-              << " configurations ";
-      auto& engine_configs =
-          heuristics.getEngineConfig(heuristics.getEngineConfigCount());
-      cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(engine_configs,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
-      return filtered_configs;
-    };
-    auto fallback_method = [=](cudnn_frontend::OperationGraph& op_graph_)
-        -> cudnn_frontend::EngineConfigList {
-      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
-                          .setOperationGraph(op_graph_)
-                          .build();
-      auto& fallback_list = fallback.getFallbackList();
-      cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(fallback_list,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
+      cudnn_frontend::EngineConfigList filtered_configs;
+      auto statuses = cudnn_frontend::get_heuristics_list<2>(
+          {"heuristics_instant", "heuristics_fallback"},
+          op_graph_,
+          deterministic ? IsNonDeterministic : AllowAll,
+          filtered_configs,
+          true);
+      VLOG(6) << "Filter config list has " << filtered_configs.size()
+              << " configurations ";
       return filtered_configs;
     };
-    std::array<cudnn_frontend::GeneratorSource const, 2> sources = {
-        heurgen_method, fallback_method};
+    std::array<cudnn_frontend::GeneratorSource const, 1> sources = {
+        heurgen_method};
     cudnn_frontend::EngineConfigGenerator generator(sources.size(),
                                                     sources.data());
@@ -204,29 +214,18 @@ class CudnnFrontendConvHelper {
         [=](cudnn_frontend::ExecutionPlan const& plan) -> bool {
           return plan.getWorkspaceSize() > workspace_size_limit;
         };
-    auto plans =
-        generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function);
+    VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_limit;
+    cudnn_frontend::executionPlans_t plans;
     bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
     if (!deterministic && (exhaustive_search || use_autotune)) {
-      size_t workspace_size_max = 0;
-      std::for_each(
-          plans.begin(), plans.end(), [&](cudnn_frontend::ExecutionPlan& opt) {
-            if (opt.getWorkspaceSize() > workspace_size_max) {
-              workspace_size_max = opt.getWorkspaceSize();
-            }
-          });
-      VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_max;
       workspace_handle->RunFunc(
           [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_data, y_data, w_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
+            auto variant_pack =
+                cudnn_frontend::VariantPackBuilder()
+                    .setWorkspacePointer(workspace_ptr)
+                    .setDataPointers(data_ptrs->size(), data_ptrs->data())
+                    .setUids(uids->size(), uids->data())
+                    .build();
             plans =
                 generator
@@ -237,7 +236,10 @@ class CudnnFrontendConvHelper {
                         variant_pack,
                         predicate_function);
           },
-          workspace_size_max);
+          workspace_size_limit);
+    } else {
+      plans =
+          generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function);
     }
     std::for_each(
@@ -249,6 +251,146 @@ class CudnnFrontendConvHelper {
     return plans;
   }
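FindExecutionPlans now delegates engine-config generation to a single heuristics query: `cudnn_frontend::get_heuristics_list<2>` fetches the instant heuristics and the fallback list in one call, replacing the hand-rolled EngineHeuristicsBuilder/EngineFallbackListBuilder pair, and plans are only timed (the `cudnnFindPlan` path) when autotuning or exhaustive search is on. A condensed sketch of the heuristics query, assuming the v0.9.x signature used in the call above:

```cpp
#include <cudnn_frontend.h>

// Keep-all predicate: cudnn_frontend filters drop a config when the
// predicate returns true, so returning false keeps every engine config.
inline bool KeepAllConfigs(cudnnBackendDescriptor_t /*engine_config*/) {
  return false;
}

inline cudnn_frontend::EngineConfigList GetFilteredConfigs(
    cudnn_frontend::OperationGraph& op_graph) {
  cudnn_frontend::EngineConfigList filtered_configs;
  auto statuses = cudnn_frontend::get_heuristics_list<2>(
      {"heuristics_instant", "heuristics_fallback"},  // queried in order
      op_graph,
      KeepAllConfigs,  // swap in a deterministic-only predicate if needed
      filtered_configs,
      true);           // evaluate both sources rather than stopping at one
  (void)statuses;      // per-source status codes; ignored in this sketch
  return filtered_configs;
}
```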
static cudnn_frontend::executionPlans_t FindExecutionPlans(
cudnn_frontend::OperationGraph* op_graph_pointer,
bool exhaustive_search,
bool deterministic,
void* x_data,
void* y_data,
void* w_data,
cudnnHandle_t handle,
phi::DnnWorkspaceHandle* workspace_handle) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
return FindExecutionPlans(op_graph_pointer,
exhaustive_search,
deterministic,
&data_ptrs,
&uids,
handle,
workspace_handle);
}
static void ExecutePlan(cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
std::vector<void*>* data_ptrs,
std::vector<int64_t>* uids,
cudnnBackendDescriptor_t plan_desc,
int64_t workspace_size) {
workspace_handle->RunFunc(
[&](void* workspace_ptr) {
auto variant_pack =
cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(data_ptrs->size(), data_ptrs->data())
.setUids(uids->size(), uids->data())
.build();
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
handle_, plan_desc, variant_pack.get_raw_desc()));
},
workspace_size);
}
static void ExecutePlan(cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
void* x_data,
void* y_data,
void* w_data,
cudnnBackendDescriptor_t plan_desc,
int64_t workspace_size) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
ExecutePlan(handle_,
workspace_handle,
&data_ptrs,
&uids,
plan_desc,
workspace_size);
}
static void ExecutePlansAndCache(
cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
std::vector<void*>* data_ptrs,
std::vector<int64_t>* uids,
cudnn_frontend::executionPlans_t* plans,
bool exhaustive_search,
const cudnn_frontend::feature_vector_t& feature_vector,
phi::autotune::CudnnFrontendPlanCache* plan_cache) {
for (auto& plan : *plans) {
try {
ExecutePlan(handle_,
workspace_handle,
data_ptrs,
uids,
plan.get_raw_desc(),
plan.getWorkspaceSize());
if (!exhaustive_search ||
plan_cache->IsStable(feature_vector, plan.getTag(), handle_)) {
plan_cache->InsertPlan(feature_vector, plan, handle_);
}
return;
} catch (cudnn_frontend::cudnnException& e) {
VLOG(4) << "Plan " << plan.describe()
<< "failed to execute. Trying next plan.";
} catch (phi::enforce::EnforceNotMet& e) {
VLOG(4) << "Plan " << plan.describe()
<< "failed to execute. Trying next plan.";
}
}
PADDLE_THROW(phi::errors::InvalidArgument(
"[CUDNN Frontend API] No valid plan could "
"be found to execute. Try setting FLAGS_conv_workspace_size_limit "
"higher."));
}
static void ExecutePlansAndCache(
cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
void* x_data,
void* y_data,
void* w_data,
cudnn_frontend::executionPlans_t* plans,
bool exhaustive_search,
const cudnn_frontend::OperationGraph& op_graph,
phi::autotune::CudnnFrontendPlanCache* plan_cache) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
ExecutePlansAndCache(handle_,
workspace_handle,
&data_ptrs,
&uids,
plans,
exhaustive_search,
op_graph.getFeatureVector(),
plan_cache);
}
static cudnn_frontend::Operation MakePointwiseOp(
cudnnPointwiseMode_t mode,
cudnnDataType_t dtype,
cudnn_frontend::Tensor const& x_desc,
cudnn_frontend::Tensor const& b_desc,
cudnn_frontend::Tensor const& y_desc,
float alpha1 = 1.0,
float alpha2 = 1.0) {
auto op_desc = cudnn_frontend::PointWiseDescBuilder()
.setMode(mode)
.setComputeType(dtype)
.build();
auto op = cudnn_frontend::OperationBuilder(
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(x_desc)
.setbDesc(b_desc)
.setyDesc(y_desc)
.setpwDesc(op_desc)
.setAlpha(alpha1)
.setAlpha2(alpha2)
.build();
VLOG(6) << op.describe();
return op;
}
 };  // class CudnnFrontendConvHelper
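With these helpers in place, all three convolution kernels below reduce to the same cache-then-search protocol: probe the per-thread plan cache, replay the cached plan on a hit, otherwise generate candidate plans and let ExecutePlansAndCache run the first one that succeeds, inserting it into the cache (subject to the saturation check). A minimal sketch of that flow; `RunConvLikeOp`, its parameter list, the include path, and the `helper` alias are illustrative assumptions, not code from this diff:

```cpp
#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"  // path assumed

using helper = phi::CudnnFrontendConvHelper;  // alias assumed by the kernels

void RunConvLikeOp(cudnnHandle_t handle,
                   phi::DnnWorkspaceHandle* workspace_handle,
                   cudnn_frontend::OperationGraph* op_graph,
                   void* x_data, void* y_data, void* w_data,
                   bool exhaustive_search,
                   phi::autotune::CudnnFrontendPlanCache* plan_cache) {
  if (plan_cache->FindPlan(*op_graph, handle)) {
    // Cache hit: replay the plan this thread built earlier on this handle.
    const cudnn_frontend::ExecutionPlan* plan = nullptr;
    int64_t workspace_size = 0;
    plan_cache->GetPlan(*op_graph, &plan, &workspace_size, handle);
    helper::ExecutePlan(handle, workspace_handle, x_data, y_data, w_data,
                        plan->get_raw_desc(), workspace_size);
    return;
  }
  // Cache miss: generate candidates (timed when autotune/exhaustive search
  // is on), then run the first plan that works and cache it.
  auto plans = helper::FindExecutionPlans(op_graph,
                                          exhaustive_search,
                                          /*deterministic=*/false,
                                          x_data, y_data, w_data,
                                          handle, workspace_handle);
  helper::ExecutePlansAndCache(handle, workspace_handle,
                               x_data, y_data, w_data, &plans,
                               exhaustive_search, *op_graph, plan_cache);
}
```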
 template <typename T>
@@ -290,28 +432,17 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
                         alpha,
                         beta);
-  if (plan_cache_bwd_data.FindPlan(op_graph, use_addto)) {
-    auto engine_config =
-        plan_cache_bwd_data.GetConfig(op_graph, handle, use_addto);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_data.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_data.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        dx_tensor_data,
+                        dy_tensor_data,
+                        w_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -325,34 +456,15 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
       handle,
       workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_data.IsStable(op_graph, plan.getTag(), use_addto)) {
-        plan_cache_bwd_data.InsertPlan(op_graph, plan, use_addto);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-    } catch (phi::enforce::EnforceNotMet& e) {
-    }
-  }
-  PADDLE_THROW(
-      phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could "
-                                   "be found to execute conv backward data."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               dx_tensor_data,
+                               dy_tensor_data,
+                               w_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_data);
 }
 template <typename T>
@@ -394,27 +506,17 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor,
                         alpha,
                         beta);
-  if (plan_cache_bwd_filter.FindPlan(op_graph)) {
-    auto engine_config = plan_cache_bwd_filter.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_filter.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_filter.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        x_tensor_data,
+                        dy_tensor_data,
+                        dw_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -428,39 +530,15 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor,
       handle,
       workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_filter.IsStable(op_graph, plan.getTag())) {
-        plan_cache_bwd_filter.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    }
-  }
-  PADDLE_THROW(phi::errors::InvalidArgument(
-      "[CUDNN Frontend API] No valid plan could "
-      "be found to execute conv backward filter."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               x_tensor_data,
+                               dy_tensor_data,
+                               dw_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_filter);
 }
 }  // namespace phi
@@ -261,27 +261,16 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
                         alpha,
                         beta);
-  if (plan_cache.FindPlan(op_graph)) {
-    auto engine_config = plan_cache.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle.RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {input_data, output_data, filter_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache.GetPlan(op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        &workspace_handle,
+                        input_data,
+                        output_data,
+                        filter_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -295,37 +284,15 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
       handle,
       &workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {input_data, output_data, filter_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search || plan_cache.IsStable(op_graph, plan.getTag())) {
-        plan_cache.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    }
-  }
-  PADDLE_THROW(
-      phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could "
-                                   "be found to execute conv."));
+  helper::ExecutePlansAndCache(handle,
+                               &workspace_handle,
+                               input_data,
+                               output_data,
+                               filter_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache);
 }
 #endif
......
Deleted file: patches/cudnn-frontend/0001-patch-for-paddle.patch

From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001
From: Tian Zheng <tizheng@nvidia.com>
Date: Thu, 27 Oct 2022 20:33:16 -0700
Subject: [PATCH] patch for paddle
---
include/cudnn_frontend_ExecutionPlan.h | 10 +++++++---
include/cudnn_frontend_ExecutionPlanCache.h | 2 +-
include/cudnn_frontend_OperationGraph.h | 2 +-
include/cudnn_frontend_find_plan.h | 6 +++---
include/cudnn_frontend_get_plan.h | 4 ++--
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h
index 7bed4b4..3314b5c 100644
--- a/include/cudnn_frontend_ExecutionPlan.h
+++ b/include/cudnn_frontend_ExecutionPlan.h
@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor {
return json_string;
#endif
}
+
+ ManagedOpaqueDescriptor GetEngineConfig() const {
+ return engine_config;
+ }
ExecutionPlan_v8(ExecutionPlan_v8 const &) = default;
ExecutionPlan_v8 &
@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_NUMERICAL_NOTE,
CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
numeric_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_BEHAVIOR_NOTE,
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
behavior_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&workSpaceSize);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h
index 99a157c..741c490 100644
--- a/include/cudnn_frontend_ExecutionPlanCache.h
+++ b/include/cudnn_frontend_ExecutionPlanCache.h
@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 {
/// String to map of feature_vector to execution plan
/// For a given FeatureVector of type T according to the Operation Graph, we get the plan.
- using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan, cudnn_frontend::ExecutionPlanCache_v1::compare>;
+ using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
FeatureVectorToPlanMap cache;
mutable std::mutex cache_mutex;
diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h
index 1478ce8..7894080 100644
--- a/include/cudnn_frontend_OperationGraph.h
+++ b/include/cudnn_frontend_OperationGraph.h
@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor {
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&global_count);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h
index 02a08a1..5f94e45 100644
--- a/include/cudnn_frontend_find_plan.h
+++ b/include/cudnn_frontend_find_plan.h
@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
cudaDeviceSynchronize();
cudaStream_t stream = nullptr;
- ::cudnnGetStream(handle, &stream);
+ cudnnGetStream(handle, &stream);
for (auto &plan : plans) {
float time_ms = 0.0f;
@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
float min_time_ms = std::numeric_limits<float>::max();
// Warm-up run
- auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
if (warmup_status != CUDNN_STATUS_SUCCESS) {
getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl;
continue;
@@ -71,7 +71,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
for (int i = 0; i < maxIterCount; i++) {
cudaEventRecord(start, stream);
- ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h
index 50535ab..c43eec9 100644
--- a/include/cudnn_frontend_get_plan.h
+++ b/include/cudnn_frontend_get_plan.h
@@ -26,7 +26,7 @@
namespace cudnn_frontend {
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra
return plans;
}
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
--
2.25.1
@@ -141,6 +141,10 @@ class TestStaticAutoTuneStatus(TestAutoTune):
         exe.run(startup_program)
         x = np.random.random(size=data_shape).astype('float32')
+        # Note(tizheng): warmup run to make sure the following runs
+        # are in the same thread. Necessary for CUDNNv8 tests
+        exe.run(program=main_program, feed={'X': x}, fetch_list=[loss])
+
         self.set_flags(enable_autotune)
         if enable_autotune:
             config = {"kernel": {"enable": True, "tuning_range": [1, 2]}}
......