Unverified commit 1cd7e68b, authored by: H hong, committed by: GitHub

optimize conv algo cache (#41891)

* optimize conv algo speed

* code polish

* remove useless code

* fix compile error

* fix cpu compile error

* do not use cudnn algo_t

* add search cache max number

* polish code

* fix cache test bug

* add groups data format to conv args

* fix cache test bug

* fix cudnn_deterministic bug

* fix test switch auto tune bug

* fix test switch autotune bug

* fix conv cache bug

* fix cache test error

* fix cache test bug

* fix windows mac compile error

* fix workspace search error

* update cudnn cache

* fix cache test bug; test=develop

* fix autotune switch test error

* polish code

* polish code
Parent f2f3f6e7
@@ -44,7 +44,13 @@ struct SearchAlgorithm {};
 template <typename AlgoT>
 struct SearchResult {
   SearchResult() {}
+  explicit SearchResult(const phi::autotune::DnnNode& node)
+      : algo(static_cast<AlgoT>(node.algo)),
+        workspace_size(node.workspace_size) {}
 
   explicit SearchResult(AlgoT a) : algo(a) {}
+  explicit SearchResult(AlgoT a, float t, size_t size)
+      : algo(a), time(t), workspace_size(size) {}
 
   AlgoT algo = static_cast<AlgoT>(0);
   float time = -1.f;
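SearchResult now round-trips through the cache: the DnnNode constructor restores a previously cached algorithm together with its workspace size, and the three-argument constructor lets the deterministic path fill in a workspace size without a measured time. A small usage sketch (the algo value is arbitrary):

// Rebuild a SearchResult from a cached node: both fields come back at once,
// so no cuDNN workspace query is needed on a cache hit.
phi::autotune::DnnNode node(/*algo=*/1, /*workspace_size=*/8 << 20);
SearchResult<cudnnConvolutionFwdAlgo_t> from_cache(node);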
@@ -76,28 +82,50 @@ struct ConvArgsBase {
   // dilations
   std::vector<int> d;
 
+  // groups
+  int group;
+
+  // data format
+  DataLayout data_layout;
+
   ConvArgsBase(const framework::Tensor* x,
                const framework::Tensor* w,
                const framework::Tensor* o,
                const std::vector<int> s,
                const std::vector<int> p,
                const std::vector<int> d,
-               DataT dtype)
-      : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
+               DataT dtype,
+               int g,
+               DataLayout layout)
+      : x(x),
+        w(w),
+        o(o),
+        s(s),
+        p(p),
+        d(d),
+        cudnn_dtype(dtype),
+        group(g),
+        data_layout(layout) {}
 
   template <typename T>
-  size_t GetCacheKey() const {
+  phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
     auto x_shape = phi::vectorize(x->dims());
     auto w_shape = phi::vectorize(w->dims());
     VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
-             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
-    return phi::autotune::ConvKey(
+             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d
+             << ", dtype=" << paddle::experimental::CppTypeToDataType<T>::Type()
+             << ", group=" << group
+             << ", data layout=" << static_cast<int64_t>(data_layout);
+
+    return phi::autotune::ConvCacheKey(
         x_shape,
         w_shape,
         p,
         s,
         d,
-        paddle::experimental::CppTypeToDataType<T>::Type());
+        paddle::experimental::CppTypeToDataType<T>::Type(),
+        group,
+        static_cast<int64_t>(data_layout));
   }
 };
...
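The two extra key fields close a real collision: two convolutions with identical shapes, strides, paddings, and dilations but different group counts (or NCHW vs. NHWC layout) used to map to the same cache slot and could silently reuse the wrong algorithm. A minimal self-contained sketch of the idea, with a simplified stand-in for the real key type:

#include <cstdint>
#include <tuple>
#include <vector>

// Simplified stand-in for phi::autotune::ConvCacheKey: once groups and data
// layout are part of the key, otherwise-identical configs stop colliding.
using SimpleConvKey = std::tuple<std::vector<int64_t>,  // x_dims
                                 std::vector<int64_t>,  // w_dims
                                 std::vector<int>,      // strides
                                 int,                   // groups
                                 int64_t>;              // data layout

bool CanReuseCachedAlgo(const SimpleConvKey& a, const SimpleConvKey& b) {
  return a == b;  // a groups/layout mismatch now forces a fresh search
}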
@@ -191,32 +191,36 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     SetConvMathType(ctx, dtype, args.cdesc);
 
     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    the auto-tune process, run exhaustive_search during that process.
       // 3. After the auto-tune process, run the cached algorithm if cached,
       //    run default mode for the rest.
-      size_t key = args.GetCacheKey<T>();
+      auto key = args.Convert2ConvCacheKey<T>();
       auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
       if (cache.Find(key)) {
-        result.algo = static_cast<AlgoT>(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast<AlgoT>(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
-          cache.Set(key, static_cast<int64_t>(result.algo));
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
+        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
      }
    }
    VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }
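Each search entry point now follows one cache-first flow: build the structured key, probe, and on a miss store the chosen algorithm together with its workspace size so later hits skip the cuDNN workspace query entirely. A condensed sketch of that flow, using the names from the diff (the ctx type is elided there; a GPU context is assumed here, and AlgoT is the surrounding specialization's algorithm type):

// Condensed view of the lookup introduced above (forward case).
template <typename T>
SearchResult<AlgoT> FindWithCache(const ConvArgs& args,
                                  bool exhaustive_search,
                                  const phi::GPUContext& ctx) {
  SearchResult<AlgoT> result;
  auto key = args.Convert2ConvCacheKey<T>();
  auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
  if (cache.Find(key)) {  // hit: algorithm and workspace restored together
    auto node = cache.Get(key);
    result.algo = static_cast<AlgoT>(node.algo);
    result.workspace_size = node.workspace_size;
  } else {                // miss: search, then cache both fields as one node
    result = exhaustive_search ? FindAlgoExhaustiveSearch<T>(args, ctx)
                               : FindAlgoHeuristic(args, ctx);
    cache.Set(key,
              phi::autotune::DnnNode(static_cast<int64_t>(result.algo),
                                     result.workspace_size));
  }
  return result;
}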
@@ -236,8 +240,9 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   }
 
  private:
-  static SearchResult<AlgoT> FindAlgoDeterministic() {
-    return SearchResult<AlgoT>(static_cast<AlgoT>(1));
+  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
+    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
   }
 
   // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
@@ -298,6 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
                 workspace_size_limit,
                 &(result.algo)));
 #endif
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -343,6 +349,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     ChooseAlgoByWorkspace<PerfT, AlgoT>(
         perf_results, workspace_size_limit, &result);
 
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -394,33 +401,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     SetConvMathType(ctx, dtype, args.cdesc);
 
     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    the auto-tune process, run exhaustive_search during that process.
       // 3. After the auto-tune process, run the cached algorithm if cached,
       //    run default mode for the rest.
-      size_t key = args.GetCacheKey<T>();
+      auto key = args.Convert2ConvCacheKey<T>();
       auto& cache =
           phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
       if (cache.Find(key)) {
-        result.algo = static_cast<AlgoT>(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast<AlgoT>(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
             phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
         if (exhaustive_search || use_autotune) {
           result = FindAlgoExhaustiveSearch<T>(args, ctx);
-          cache.Set(key, static_cast<int64_t>(result.algo));
         } else {
           result = FindAlgoHeuristic(args, ctx);
         }
+        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
       }
     }
     VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
     return result;
   }
@@ -440,8 +451,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   }
 
  private:
-  static SearchResult<AlgoT> FindAlgoDeterministic() {
-    return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
+  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size =
+        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
+    return SearchResult<AlgoT>(
+        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
   }
 
   static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
@@ -513,7 +527,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 workspace_size_limit,
                 &(result.algo)));
 #endif
-
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -559,6 +573,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     ChooseAlgoByWorkspace<PerfT, AlgoT>(
         perf_results, workspace_size_limit, &result);
 
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -609,33 +624,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     SetConvMathType(ctx, dtype, args.cdesc);
 
     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    the auto-tune process, run exhaustive_search during that process.
       // 3. After the auto-tune process, run the cached algorithm if cached,
       //    run default mode for the rest.
-      size_t key = args.GetCacheKey<T>();
+      auto key = args.Convert2ConvCacheKey<T>();
       auto& cache =
           phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
       if (cache.Find(key)) {
-        result.algo = static_cast<AlgoT>(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast<AlgoT>(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
             phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
         if (exhaustive_search || use_autotune) {
           result = FindAlgoExhaustiveSearch<T>(args, ctx);
-          cache.Set(key, static_cast<int64_t>(result.algo));
         } else {
           result = FindAlgoHeuristic(args, ctx);
         }
+        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
       }
     }
     VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
     return result;
   }
@@ -656,8 +675,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   }
 
  private:
-  static SearchResult<AlgoT> FindAlgoDeterministic() {
-    return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
+  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size =
+        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
+    return SearchResult<AlgoT>(
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
   }
 
   static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
@@ -718,6 +740,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                 &(result.algo)));
 #endif
 
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -786,6 +809,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       ChooseAlgo(perf_results, workspace_size_limit, &result);
     }
 
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
...
@@ -984,6 +984,17 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
  */
 PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");
 
+/**
+ * Conv search cache max number related FLAG
+ * Name: FLAGS_search_cache_max_number
+ * Since Version: 2.3.0
+ * Value Range: int32, default=1000000
+ * Example:
+ */
+PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
+                             1000000,
+                             "Max number of cached conv algorithm search results.");
+
 /**
  * Performance related FLAG
  * Name: einsum_opt
...
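One design note on the cap: rather than maintaining an LRU, the cache is flushed wholesale once it exceeds FLAGS_search_cache_max_number (see CudnnAlgorithmsCacheMap::Set later in this diff). That keeps Set O(1) and is harmless in practice because the cache refills after one search per live configuration. A sketch of how any translation unit consumes the exported flag:

// DECLARE_int32 makes the flag visible outside the defining file; here it is
// read as a size cap before inserting (mirrors the Set() shown later).
DECLARE_int32(search_cache_max_number);

bool ShouldFlushCache(size_t current_entries) {
  return current_entries > static_cast<size_t>(FLAGS_search_cache_max_number);
}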
@@ -21,21 +21,6 @@
 namespace phi {
 namespace autotune {
 
-// Define the cache key of operator
-size_t ConvKey(const std::vector<int64_t>& x_dims,
-               const std::vector<int64_t>& w_dims,
-               const std::vector<int>& strides,
-               const std::vector<int>& paddings,
-               const std::vector<int>& dilations,
-               phi::DataType dtype) {
-  return GetKey(x_dims,
-                w_dims,
-                strides,
-                paddings,
-                dilations,
-                static_cast<int64_t>(dtype));
-}
-
 size_t TransposeKey(const std::vector<int64_t>& x_dims,
                     const std::vector<int32_t>& perm,
                     phi::DataType dtype) {
@@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() {
     cache_hits += v.second.CacheHits();
     cache_misses += v.second.CacheMisses();
   }
+
+  for (auto& v : cudnn_auto_tune_map_) {
+    VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width)
+            << AlgorithmTypeString(v.first)
+            << " Cache Size: " << v.second.Size()
+            << " Hits: " << v.second.CacheHits()
+            << " Misses: " << v.second.CacheMisses()
+            << " Hit Rate: " << v.second.CacheHitRate();
+    size += v.second.Size();
+    cache_hits += v.second.CacheHits();
+    cache_misses += v.second.CacheMisses();
+  }
+
   total_size_ = size;
   total_cache_hits_ = cache_hits;
   total_cache_misses_ = cache_misses;
...
@@ -24,6 +24,8 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
 
+DECLARE_int32(search_cache_max_number);
+
 inline void HashCombine(std::size_t* seed) {}
 
 // combine hash value
@@ -32,6 +34,7 @@ template <typename T, typename... Rest>
 inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
   std::hash<T> hasher;
   *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
+  *seed *= 0x00000100000001B3;
   HashCombine(seed, rest...);
 }
@@ -41,7 +44,7 @@ namespace std {
 template <typename T>
 struct hash<std::vector<T>> {
   std::size_t operator()(std::vector<T> const& vec) const noexcept {
-    std::size_t seed = 0;
+    std::size_t seed = 0xcbf29ce484222325;
     for (auto val : vec) {
       HashCombine(&seed, val);
     }
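The two new constants are FNV-1a's 64-bit parameters: 0xcbf29ce484222325 is the offset basis now seeding the vector hash, and 0x00000100000001B3 is the FNV prime multiplied in after each boost-style xor/shift combine. The extra multiply diffuses small differences, such as neighboring tensor dimensions, across the whole 64-bit key space, cutting collision odds for the new structured keys. A standalone copy of the mixer:

#include <cstddef>
#include <functional>
#include <vector>

// Boost-style hash combine followed by a multiply with the 64-bit FNV prime.
template <typename T>
void Mix(std::size_t* seed, const T& v) {
  *seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
  *seed *= 0x00000100000001B3;
}

std::size_t HashDims(const std::vector<int>& dims) {
  std::size_t seed = 0xcbf29ce484222325;  // 64-bit FNV offset basis
  for (int d : dims) Mix(&seed, d);
  return seed;
}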
@@ -53,6 +56,14 @@ struct hash<std::vector<T>> {
 namespace phi {
 namespace autotune {
 
+struct DnnNode {
+  DnnNode() {}
+  explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {}
+
+  int64_t algo;
+  size_t workspace_size = 0;
+};
+
 template <typename... Args>
 size_t GetKey(Args&&... args) {
   size_t seed = 0;
@@ -60,13 +71,130 @@ size_t GetKey(Args&&... args) {
   return seed;
 }
 
-// Define the cache key of operator
-size_t ConvKey(const std::vector<int64_t>& x_dims,
-               const std::vector<int64_t>& w_dims,
-               const std::vector<int>& strides,
-               const std::vector<int>& paddings,
-               const std::vector<int>& dilations,
-               phi::DataType dtype);
+struct ConvCacheKey {
+  ConvCacheKey() {}
+  explicit ConvCacheKey(const std::vector<int64_t>& x_dims,
+                        const std::vector<int64_t>& w_dims,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::vector<int>& dilations,
+                        phi::DataType dtype,
+                        int groups,
+                        int64_t data_layout)
+      : x_dims_(x_dims),
+        w_dims_(w_dims),
+        strides_(strides),
+        paddings_(paddings),
+        dilations_(dilations),
+        dtype_(dtype),
+        groups_(groups),
+        data_layout_(data_layout) {}
+
+  size_t hash_value() const {
+    return GetKey(x_dims_,
+                  w_dims_,
+                  strides_,
+                  paddings_,
+                  dilations_,
+                  static_cast<int64_t>(dtype_),
+                  groups_,
+                  data_layout_);
+  }
+
+  std::vector<int64_t> x_dims_;
+  std::vector<int64_t> w_dims_;
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::vector<int> dilations_;
+  phi::DataType dtype_;
+  int groups_;
+  int64_t data_layout_;
+};
+
+struct ConvCacheKeyHash {
+  size_t operator()(const ConvCacheKey& cache) const {
+    return cache.hash_value();
+  }
+};
+
+struct ConvCacheKeyEqual {
+  size_t operator()(const ConvCacheKey& first,
+                    const ConvCacheKey& second) const {
+    if (first.x_dims_ != second.x_dims_) return false;
+    if (first.w_dims_ != second.w_dims_) return false;
+    if (first.strides_ != second.strides_) return false;
+    if (first.paddings_ != second.paddings_) return false;
+    if (first.dilations_ != second.dilations_) return false;
+    if (first.dtype_ != second.dtype_) return false;
+    if (first.groups_ != second.groups_) return false;
+    if (first.data_layout_ != second.data_layout_) return false;
+    return true;
+  }
+};
+
+class CudnnAlgorithmsCacheMap {
+ public:
+  CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }
+
+  DnnNode Get(const ConvCacheKey& key) {
+    std::lock_guard<std::mutex> lock(*cache_mutex_);
+    PADDLE_ENFORCE_NE(
+        hash_.find(key),
+        hash_.end(),
+        phi::errors::PreconditionNotMet("The key does not exist."));
+    return hash_[key];
+  }
+
+  bool Find(const ConvCacheKey& key) {
+    bool ret = false;
+    std::lock_guard<std::mutex> lock(*cache_mutex_);
+    if (hash_.find(key) != hash_.end()) {
+      cache_hits_++;
+      ret = true;
+    } else {
+      cache_misses_++;
+    }
+    return ret;
+  }
+
+  void Clean() {
+    std::lock_guard<std::mutex> lock(*cache_mutex_);
+    hash_.clear();
+    cache_hits_ = 0;
+    cache_misses_ = 0;
+  }
+
+  void Set(const ConvCacheKey& key, DnnNode algo) {
+    std::lock_guard<std::mutex> lock(*cache_mutex_);
+    if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) {
+      hash_.clear();
+    }
+    hash_[key] = algo;
+  }
+
+  int64_t CacheMisses() const { return cache_misses_; }
+
+  int64_t CacheHits() const { return cache_hits_; }
+
+  float CacheHitRate() const {
+    int64_t num_accesses = cache_hits_ + cache_misses_;
+    float cache_hit_rate = 0.;
+    if (num_accesses != 0) {
+      cache_hit_rate =
+          static_cast<float>(cache_hits_) / static_cast<float>(num_accesses);
+    }
+    return cache_hit_rate;
+  }
+
+  int64_t Size() const { return hash_.size(); }
+
+ private:
+  std::unordered_map<ConvCacheKey, DnnNode, ConvCacheKeyHash, ConvCacheKeyEqual>
+      hash_;
+  std::shared_ptr<std::mutex> cache_mutex_;
+  int64_t cache_hits_{0};
+  int64_t cache_misses_{0};
+};
+
 size_t TransposeKey(const std::vector<int64_t>& x_dims,
                     const std::vector<int32_t>& perm,
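Taken together, a typical lifecycle for one convolution configuration looks like the following sketch (key values are arbitrary; the first probe counts as a miss, every later probe as a hit):

// Hypothetical walk-through of CudnnAlgorithmsCacheMap: miss, store, hit.
phi::autotune::ConvCacheKey key({1, 3, 224, 224},   // x_dims
                                {64, 3, 7, 7},      // w_dims
                                {2, 2},             // strides
                                {3, 3},             // paddings
                                {1, 1},             // dilations
                                phi::DataType::FLOAT32,
                                /*groups=*/1,
                                /*data_layout=*/0);

auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
if (!cache.Find(key)) {  // first step: miss, run the search once
  phi::autotune::DnnNode node(/*algo=*/1, /*workspace_size=*/4 << 20);
  cache.Set(key, node);
}
phi::autotune::DnnNode cached = cache.Get(key);  // later steps: pure lookup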
@@ -77,7 +205,7 @@ class AlgorithmsCache {
  public:
   AlgorithmsCache() : cache_mutex_(new std::mutex()) { hash_.clear(); }
 
-  AlgorithmT Get(size_t key) {
+  AlgorithmT Get(const size_t& key) {
     std::lock_guard<std::mutex> lock(*cache_mutex_);
     PADDLE_ENFORCE_NE(
         hash_.find(key),
@@ -86,7 +214,7 @@ class AlgorithmsCache {
     return hash_[key];
   }
 
-  bool Find(size_t key) {
+  bool Find(const size_t& key) {
     bool ret = false;
     std::lock_guard<std::mutex> lock(*cache_mutex_);
     if (hash_.find(key) != hash_.end()) {
@@ -105,7 +233,7 @@ class AlgorithmsCache {
     cache_misses_ = 0;
   }
 
-  void Set(size_t key, AlgorithmT algo) {
+  void Set(const size_t& key, AlgorithmT algo) {
     std::lock_guard<std::mutex> lock(*cache_mutex_);
     hash_[key] = algo;
   }
@@ -143,9 +271,12 @@ enum class AlgorithmType {
 };
 
 // AlgorithmsConfigKey -> AlgorithmsID
+// (todo. hong) use cudnnConvolutionFwdAlgo_t
 using AlgorithmsCacheMap = AlgorithmsCache<int64_t>;
 // AlgorithmType -> AlgorithmsCache
 using AlgorithmsTypeMap = std::unordered_map<int64_t, AlgorithmsCacheMap>;
+using CudnnAlgorithmsTypeMap =
+    std::unordered_map<int64_t, CudnnAlgorithmsCacheMap>;
 
 class AutoTuneCache {
  public:
@@ -158,16 +289,19 @@ class AutoTuneCache {
     return auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
 
-  AlgorithmsCacheMap& GetConvForward() {
-    return Get(AlgorithmType::kConvForward);
+  CudnnAlgorithmsCacheMap& GetConvForward() {
+    return cudnn_auto_tune_map_[static_cast<int64_t>(
+        AlgorithmType::kConvForward)];
   }
 
-  AlgorithmsCacheMap& GetConvBackwardData() {
-    return Get(AlgorithmType::kConvBackwardData);
+  CudnnAlgorithmsCacheMap& GetConvBackwardData() {
+    return cudnn_auto_tune_map_[static_cast<int64_t>(
+        AlgorithmType::kConvBackwardData)];
   }
 
-  AlgorithmsCacheMap& GetConvBackwardFilter() {
-    return Get(AlgorithmType::kConvBackwardFilter);
+  CudnnAlgorithmsCacheMap& GetConvBackwardFilter() {
+    return cudnn_auto_tune_map_[static_cast<int64_t>(
+        AlgorithmType::kConvBackwardFilter)];
   }
 
   AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
@@ -176,6 +310,10 @@ class AutoTuneCache {
     for (auto& v : auto_tune_map_) {
       v.second.Clean();
     }
+
+    for (auto& v : cudnn_auto_tune_map_) {
+      v.second.Clean();
+    }
   }
 
   void UpdateStatus();
@@ -206,14 +344,25 @@ class AutoTuneCache {
   void Register(const AlgorithmType& algo_type) {
     std::lock_guard<std::mutex> lock(*autotune_cache_mutex_);
-    int64_t key = static_cast<int64_t>(algo_type);
-    if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
-      AlgorithmsCacheMap cache;
-      auto_tune_map_[key] = cache;
+    if (algo_type == AlgorithmType::kConvForward ||
+        algo_type == AlgorithmType::kConvBackwardData ||
+        algo_type == AlgorithmType::kConvBackwardFilter) {
+      int64_t key = static_cast<int64_t>(algo_type);
+      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
+        CudnnAlgorithmsCacheMap cache;
+        cudnn_auto_tune_map_[key] = cache;
+      }
+    } else {
+      int64_t key = static_cast<int64_t>(algo_type);
+      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
+        AlgorithmsCacheMap cache;
+        auto_tune_map_[key] = cache;
+      }
     }
   }
 
   AlgorithmsTypeMap auto_tune_map_;
+  CudnnAlgorithmsTypeMap cudnn_auto_tune_map_;
   std::shared_ptr<std::mutex> autotune_cache_mutex_;
   int64_t total_cache_hits_{0};
   int64_t total_cache_misses_{0};
...
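Register() is now the dispatch point: the three conv algorithm types land in the cuDNN-specific map (structured ConvCacheKey keys, DnnNode values), while every other algorithm type, such as transpose, keeps the generic hashed-key cache. Callers see the split only through which getter they use:

// Conv lookups resolve to the cuDNN-specific cache...
auto& conv_cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
// ...while non-conv algorithms keep the generic size_t -> int64_t cache.
auto& transpose_cache = phi::autotune::AutoTuneCache::Instance().GetTranspose();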
@@ -34,20 +34,23 @@ TEST(AlgosCache, AlgosCache) {
   std::vector<int> dilations = {1, 1};
   phi::DataType dtype = paddle::experimental::CppTypeToDataType<float>::Type();
 
-  auto key = phi::autotune::ConvKey(
-      x_shape, w_shape, paddings, strides, dilations, dtype);
+  phi::autotune::ConvCacheKey key(
+      x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0);
   EXPECT_EQ(cache.Find(key), false);
-  cache.Set(key, ConvAlgos::GEMMKernel);
+  phi::autotune::DnnNode node(static_cast<int64_t>(ConvAlgos::GEMMKernel), 0);
+  cache.Set(key, node);
   EXPECT_EQ(cache.Size(), 1);
   EXPECT_EQ(cache.Find(key), true);
   auto algo = cache.Get(key);
-  EXPECT_EQ(algo, ConvAlgos::GEMMKernel);
+  EXPECT_EQ(algo.algo, ConvAlgos::GEMMKernel);
 
   x_shape = {4, 128, 128, 3};
-  key = phi::autotune::ConvKey(
-      x_shape, w_shape, paddings, strides, dilations, dtype);
-  EXPECT_EQ(cache.Find(key), false);
-  cache.Set(key, ConvAlgos::CuDNNKernel_1);
+  phi::autotune::ConvCacheKey key1(
+      x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1);
+  EXPECT_EQ(cache.Find(key1), false);
+  phi::autotune::DnnNode node1(static_cast<int64_t>(ConvAlgos::CuDNNKernel_1),
+                               0);
+  cache.Set(key1, node1);
   EXPECT_EQ(cache.Size(), 2);
   EXPECT_EQ(cache.CacheHits(), 1);
   EXPECT_EQ(cache.CacheMisses(), 2);
...
@@ -254,6 +254,8 @@ void ConvCudnnGradGradKernel(
   auto dtype = paddle::platform::CudnnDataType<T>::type;
 
   auto handle = ctx.cudnn_handle();
+  auto layout = paddle::platform::GetCudnnTensorFormat(
+      paddle::platform::DataLayout::kNCHW);
 
   paddle::operators::ConvArgs args1{&transformed_ddX,
                                     W,
@@ -261,28 +263,36 @@ void ConvCudnnGradGradKernel(
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args2{&transformed_X,
                                     ddW,
                                     &transformed_ddO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args3{&transformed_ddX,
                                     dW,
                                     &transformed_dO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args4{&transformed_dX,
                                     ddW,
                                     &transformed_dO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
 
 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result1;
@@ -298,9 +308,6 @@ void ConvCudnnGradGradKernel(
       filter_result;
 #endif
 
-  auto layout = paddle::platform::GetCudnnTensorFormat(
-      paddle::platform::DataLayout::kNCHW);
-
   // ddo = conv(ddI, W) + conv(I, ddW)
   size_t workspace_size = 0;
...
@@ -251,27 +251,33 @@ void ConvCudnnGradKernel(const Context& ctx,
   T* input_grad_data = nullptr;
   T* transformed_input_grad_data = nullptr;
 
+  paddle::platform::DataLayout layout =
+      compute_format == paddle::platform::DataLayout::kNHWC
+          ? paddle::platform::DataLayout::kNHWC
+          : paddle::platform::DataLayout::kNCHW;
+
   paddle::operators::ConvArgs args1{&transformed_input_grad,
                                     &transformed_filter_channel,
                                     &transformed_output_grad_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
   paddle::operators::ConvArgs args2{&transformed_input,
                                     &transformed_filter_grad_channel,
                                     &transformed_output_grad_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
 
   auto handle = ctx.cudnn_handle();
-  paddle::platform::DataLayout layout =
-      compute_format == paddle::platform::DataLayout::kNHWC
-          ? paddle::platform::DataLayout::kNHWC
-          : paddle::platform::DataLayout::kNCHW;
+  // TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == paddle::platform::DataLayout::kNHWC
                  ? paddle::platform::DataLayout::kNDHWC
@@ -368,8 +374,7 @@ void ConvCudnnGradKernel(const Context& ctx,
     using search1 =
         paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
     bwd_result = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-    workspace_size_d = std::max(
-        workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo));
+    workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
 #endif
   }
 
@@ -400,8 +405,7 @@ void ConvCudnnGradKernel(const Context& ctx,
         search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
     VLOG(3) << "filter algo: " << filter_result.algo << ", time "
             << filter_result.time;
-    workspace_size_w = std::max(
-        workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo));
+    workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
 #endif
   }
...
@@ -213,7 +213,9 @@ void ConvCudnnKernel(const Context& ctx,
                                    strides,
                                    padding_common,
                                    dilations,
-                                   dtype};
+                                   dtype,
+                                   groups,
+                                   compute_format};
 
   auto handle = ctx.cudnn_handle();
   auto workspace_handle = ctx.cudnn_workspace_handle();
@@ -314,7 +316,7 @@ void ConvCudnnKernel(const Context& ctx,
   using search =
       paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
   fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx);
-  workspace_size = search::GetWorkspaceSize(args, fwd_result.algo);
+  workspace_size = fwd_result.workspace_size;
 #endif
 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
...
@@ -179,14 +179,18 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
   paddle::operators::ConvArgs args2{&transformed_dout,
                                     &filter,
                                     &x_transpose,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
 
 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
@@ -625,6 +629,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   auto dtype = paddle::platform::CudnnDataType<T>::type;
 
   auto handle = ctx.cudnn_handle();
+  auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
 
   paddle::operators::ConvArgs args1{&transformed_ddout_channel,
                                     &filter,
@@ -632,14 +637,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
   paddle::operators::ConvArgs args2{&transformed_ddout_channel,
                                     &ddfilter,
                                     &transformed_x,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
 
   paddle::operators::ConvArgs args3{&transformed_dout,
                                     dfilter,
@@ -647,14 +656,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
   paddle::operators::ConvArgs args4{&transformed_dout,
                                     &ddfilter,
                                     &transformed_dx_channel,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
 
 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
   paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
@@ -669,8 +682,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
 #endif
 
-  auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
-
   // ddo = conv(ddI, filter) + conv(I, ddfilter)
   size_t workspace_size = 0;
...
@@ -205,7 +205,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
                                    strides,
                                    padding_common,
                                    dilations_,
-                                   dtype};
+                                   dtype,
+                                   groups,
+                                   data_layout};
   args.handle = handle;
   args.idesc.set(transformed_out, iwo_groups);
   args.wdesc.set(filter, layout_tensor, iwo_groups);
...
@@ -71,11 +71,8 @@ class TestAutoTune(unittest.TestCase):
         }
         if paddle.is_compiled_with_cuda():
             # Total 3 * num_iters cache accesses, only iter 2 hits the cache.
-            if enable_autotune and step_id >= 1:
-                expected_res["cache_size"] = 3
-            if enable_autotune and step_id == 2:
-                expected_res["cache_hit_rate"] = np.round(
-                    float(3) / float(9), 5)
+            expected_res["cache_size"] = 3
+            expected_res["cache_hit_rate"] = (step_id + 0.0) / (step_id + 1.0)
         return expected_res
 
     def test_autotune(self):
@@ -91,11 +88,11 @@ class TestAutoTune(unittest.TestCase):
     def check_status(self, expected_res):
         status = paddle.fluid.core.autotune_status()
         for key in status.keys():
+            v = status[key]
             if key == "cache_hit_rate":
-                v = np.round(status[key], 5)
+                self.assertTrue(np.allclose(v, expected_res[key]))
             else:
-                v = status[key]
-            self.assertEqual(v, expected_res[key])
+                self.assertEqual(v, expected_res[key])
 
 
 class TestDygraphAutoTuneStatus(TestAutoTune):
...