From 1cd7e68b34487f06fb52acf5381fdd533425b9e0 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Thu, 25 Aug 2022 17:48:35 +0800
Subject: [PATCH] optimize conv algo cache (#41891)

* optimize conv algo speed
* code polish
* remove useless code
* fix compile error
* fix cpu compile error
* not use cudnn algo t
* add search cache max number
* polish code
* fix cache test bug
* add groups data format to conv args
* fix cache test bug
* fix cudnn_deterministic bug
* fix test switch auto tune bug
* fix test switch autotune bug
* fix conv cache bug
* fix cache test error
* fix cache test bug
* fix windows mac compile error
* fix workspace search error
* update cudnn cache
* fix cache test bug; test=develop
* fix autotune switch test error
* polish code
* polish code
---
 paddle/fluid/operators/conv_base_helper.h     |  40 +++-
 paddle/fluid/operators/conv_cudnn_helper.h    |  74 ++++---
 paddle/fluid/platform/flags.cc                |  11 +
 paddle/phi/kernels/autotune/cache.cc          |  28 ++-
 paddle/phi/kernels/autotune/cache.h           | 191 ++++++++++++++++--
 paddle/phi/kernels/autotune/cache_test.cc     |  19 +-
 .../kernels/gpudnn/conv_grad_grad_kernel.cu   |  21 +-
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu |  24 ++-
 paddle/phi/kernels/gpudnn/conv_kernel.cu      |   6 +-
 .../gpudnn/conv_transpose_grad_kernel.cu      |  27 ++-
 .../kernels/gpudnn/conv_transpose_kernel.cu   |   4 +-
 .../tests/unittests/test_switch_autotune.py   |  13 +-
 12 files changed, 347 insertions(+), 111 deletions(-)

diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h
index 8425dcb521..b52936c197 100644
--- a/paddle/fluid/operators/conv_base_helper.h
+++ b/paddle/fluid/operators/conv_base_helper.h
@@ -44,7 +44,13 @@ struct SearchAlgorithm {};
 template
 struct SearchResult {
   SearchResult() {}
+  explicit SearchResult(const phi::autotune::DnnNode& node)
+      : algo(static_cast(node.algo)),
+        workspace_size(node.workspace_size) {}
+
   explicit SearchResult(AlgoT a) : algo(a) {}
+  explicit SearchResult(AlgoT a, float t, size_t size)
+      : algo(a), time(t), workspace_size(size) {}

   AlgoT algo = static_cast(0);
   float time = -1.f;
@@ -76,28 +82,50 @@ struct ConvArgsBase {
   // dilations
   std::vector d;

+  // groups
+  int group;
+
+  // data format
+  DataLayout data_layout;
+
   ConvArgsBase(const framework::Tensor* x,
                const framework::Tensor* w,
                const framework::Tensor* o,
                const std::vector s,
                const std::vector p,
                const std::vector d,
-               DataT dtype)
-      : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
+               DataT dtype,
+               int g,
+               DataLayout layout)
+      : x(x),
+        w(w),
+        o(o),
+        s(s),
+        p(p),
+        d(d),
+        cudnn_dtype(dtype),
+        group(g),
+        data_layout(layout) {}

   template
-  size_t GetCacheKey() const {
+  phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
     auto x_shape = phi::vectorize(x->dims());
     auto w_shape = phi::vectorize(w->dims());
     VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
-             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
-    return phi::autotune::ConvKey(
+             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d
+             << ", data=" << paddle::experimental::CppTypeToDataType::Type()
+             << ", group=" << group
+             << ", data layout=" << static_cast(data_layout);
+
+    return phi::autotune::ConvCacheKey(
         x_shape,
         w_shape,
         p,
         s,
        d,
-        paddle::experimental::CppTypeToDataType::Type());
+        paddle::experimental::CppTypeToDataType::Type(),
+        group,
+        static_cast(data_layout));
   }
 };

diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 1b8d421d13..e6fcf2be28 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -191,32 +191,36 @@ struct SearchAlgorithm {
     SetConvMathType(ctx, dtype, args.cdesc);

     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    auto-tune process, run exhaustive_search during mentioned process.
       // 3. After auto-tune process, run cached algorithm if cached, run
       //    default mode for the rest.
-      size_t key = args.GetCacheKey();
+      auto key = args.Convert2ConvCacheKey();
       auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
       if (cache.Find(key)) {
-        result.algo = static_cast(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
             phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
         if (exhaustive_search || use_autotune) {
           result = FindAlgoExhaustiveSearch(args, ctx);
-          cache.Set(key, static_cast(result.algo));
         } else {
           result = FindAlgoHeuristic(args, ctx);
         }
+        phi::autotune::DnnNode node(static_cast(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
       }
     }

     VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
     return result;
   }
@@ -236,8 +240,9 @@ struct SearchAlgorithm {
   }

 private:
-  static SearchResult FindAlgoDeterministic() {
-    return SearchResult(static_cast(1));
+  static SearchResult FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size = GetWorkspaceSize(args, static_cast(1));
+    return SearchResult(static_cast(1), -1.0, workspace_size);
   }

   // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
@@ -298,6 +303,7 @@ struct SearchAlgorithm {
         workspace_size_limit,
         &(result.algo)));
 #endif
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -343,6 +349,7 @@ struct SearchAlgorithm {
     ChooseAlgoByWorkspace(
         perf_results, workspace_size_limit, &result);
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -394,33 +401,37 @@ struct SearchAlgorithm {
     SetConvMathType(ctx, dtype, args.cdesc);

     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    auto-tune process, run exhaustive_search during mentioned process.
       // 3. After auto-tune process, run cached algorithm if cached, run
       //    default mode for the rest.
-      size_t key = args.GetCacheKey();
+      auto key = args.Convert2ConvCacheKey();
       auto& cache =
           phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
       if (cache.Find(key)) {
-        result.algo = static_cast(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
         if (exhaustive_search || use_autotune) {
           result = FindAlgoExhaustiveSearch(args, ctx);
-          cache.Set(key, static_cast(result.algo));
         } else {
           result = FindAlgoHeuristic(args, ctx);
         }
+        phi::autotune::DnnNode node(static_cast(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
       }
     }

     VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
     return result;
   }
@@ -440,8 +451,11 @@ struct SearchAlgorithm {
   }

 private:
-  static SearchResult FindAlgoDeterministic() {
-    return SearchResult(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
+  static SearchResult FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size =
+        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
+    return SearchResult(
+        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
   }

   static SearchResult FindAlgoHeuristic(const ConvArgs& args,
@@ -513,7 +527,7 @@ struct SearchAlgorithm {
         workspace_size_limit,
         &(result.algo)));
 #endif
-
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -559,6 +573,7 @@ struct SearchAlgorithm {
     ChooseAlgoByWorkspace(
         perf_results, workspace_size_limit, &result);
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -609,33 +624,37 @@ struct SearchAlgorithm {
     SetConvMathType(ctx, dtype, args.cdesc);

     if (deterministic) {
-      result = FindAlgoDeterministic();
+      result = FindAlgoDeterministic(args);
     } else {
       // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
       // 2. Once turning on auto-tune, run heuristic search (default) before
       //    auto-tune process, run exhaustive_search during mentioned process.
       // 3. After auto-tune process, run cached algorithm if cached, run
       //    default mode for the rest.
-      size_t key = args.GetCacheKey();
+      auto key = args.Convert2ConvCacheKey();
       auto& cache =
           phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
       if (cache.Find(key)) {
-        result.algo = static_cast(cache.Get(key));
+        auto t = cache.Get(key);
+        result.algo = static_cast(t.algo);
+        result.workspace_size = t.workspace_size;
       } else {
         bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
         if (exhaustive_search || use_autotune) {
           result = FindAlgoExhaustiveSearch(args, ctx);
-          cache.Set(key, static_cast(result.algo));
         } else {
           result = FindAlgoHeuristic(args, ctx);
         }
+        phi::autotune::DnnNode node(static_cast(result.algo),
+                                    result.workspace_size);
+        cache.Set(key, node);
       }
     }

     VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
-            << ", choose algo=" << result.algo << ", workspace="
-            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
+            << ", choose algo=" << result.algo
+            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
     return result;
   }
@@ -656,8 +675,11 @@ struct SearchAlgorithm {
   }

 private:
-  static SearchResult FindAlgoDeterministic() {
-    return SearchResult(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
+  static SearchResult FindAlgoDeterministic(const ConvArgs& args) {
+    auto workspace_size =
+        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
+    return SearchResult(
+        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
   }

   static SearchResult FindAlgoHeuristic(const ConvArgs& args,
@@ -718,6 +740,7 @@ struct SearchAlgorithm {
         &(result.algo)));
 #endif

+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }
@@ -786,6 +809,7 @@ struct SearchAlgorithm {
       ChooseAlgo(perf_results, workspace_size_limit, &result);
     }

+    result.workspace_size = GetWorkspaceSize(args, result.algo);
     return result;
   }

diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 54063e1192..28dddc1fbe 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -984,6 +984,17 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
  */
 PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");

+/**
+ * Conv search cache max number related FLAG
+ * Name: FLAGS_search_cache_max_number
+ * Since Version: 2.3.0
+ * Value Range: int32, default=1000000
+ * Example:
+ */
+PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
+                             1000000,
+                             "search_cache_max_number.");
+
 /**
  * Performance related FLAG
  * Name: einsum_opt

diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc
index 838f2dd265..ad7a2b134a 100644
--- a/paddle/phi/kernels/autotune/cache.cc
+++ b/paddle/phi/kernels/autotune/cache.cc
@@ -21,21 +21,6 @@
 namespace phi {
 namespace autotune {

-// Define the cache key of operator
-size_t ConvKey(const std::vector& x_dims,
-               const std::vector& w_dims,
-               const std::vector& strides,
-               const std::vector& paddings,
-               const std::vector& dilations,
-               phi::DataType dtype) {
-  return GetKey(x_dims,
-                w_dims,
-                strides,
-                paddings,
-                dilations,
-                static_cast(dtype));
-}
-
 size_t TransposeKey(const std::vector& x_dims,
                     const std::vector& perm,
                     phi::DataType dtype) {
@@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() {
     cache_hits += v.second.CacheHits();
     cache_misses += v.second.CacheMisses();
   }
+
+  for (auto& v : cudnn_auto_tune_map_) {
+    VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width)
+            << AlgorithmTypeString(v.first)
+            << " Cache Size: " << v.second.Size()
+            << " Hits: " << v.second.CacheHits()
+            << " Misses: " << v.second.CacheMisses()
+            << " Hit Rate: " << v.second.CacheHitRate();
+    size += v.second.Size();
+    cache_hits += v.second.CacheHits();
+    cache_misses += v.second.CacheMisses();
+  }
+
   total_size_ = size;
   total_cache_hits_ = cache_hits;
   total_cache_misses_ = cache_misses;

diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
index 1263cf40e5..aacebc6657 100644
--- a/paddle/phi/kernels/autotune/cache.h
+++ b/paddle/phi/kernels/autotune/cache.h
@@ -24,6 +24,8 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"

+DECLARE_int32(search_cache_max_number);
+
 inline void HashCombine(std::size_t* seed) {}

 // combine hash value
@@ -32,6 +34,7 @@ template
 inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
   std::hash hasher;
   *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
+  *seed *= 0x00000100000001B3;
   HashCombine(seed, rest...);
 }
@@ -41,7 +44,7 @@ namespace std {
 template
 struct hash> {
   std::size_t operator()(std::vector const& vec) const noexcept {
-    std::size_t seed = 0;
+    std::size_t seed = 0xcbf29ce484222325;
     for (auto val : vec) {
       HashCombine(&seed, val);
     }
@@ -53,6 +56,14 @@ struct hash> {
 namespace phi {
 namespace autotune {

+struct DnnNode {
+  DnnNode() {}
+  explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {}
+
+  int64_t algo;
+  size_t workspace_size = 0;
+};
+
 template
 size_t GetKey(Args&&... args) {
   size_t seed = 0;
@@ -60,13 +71,130 @@ size_t GetKey(Args&&... args) {
   return seed;
 }

-// Define the cache key of operator
-size_t ConvKey(const std::vector& x_dims,
-               const std::vector& w_dims,
-               const std::vector& strides,
-               const std::vector& paddings,
-               const std::vector& dilations,
-               phi::DataType dtype);
+struct ConvCacheKey {
+  ConvCacheKey() {}
+  explicit ConvCacheKey(const std::vector& x_dims,
+                        const std::vector& w_dims,
+                        const std::vector& strides,
+                        const std::vector& paddings,
+                        const std::vector& dilations,
+                        phi::DataType dtype,
+                        int groups,
+                        int64_t data_layout)
+      : x_dims_(x_dims),
+        w_dims_(w_dims),
+        strides_(strides),
+        paddings_(paddings),
+        dilations_(dilations),
+        dtype_(dtype),
+        groups_(groups),
+        data_layout_(data_layout) {}
+  size_t hash_value() const {
+    return GetKey(x_dims_,
+                  w_dims_,
+                  strides_,
+                  paddings_,
+                  dilations_,
+                  static_cast(dtype_),
+                  groups_,
+                  data_layout_);
+  }
+  std::vector x_dims_;
+  std::vector w_dims_;
+  std::vector strides_;
+  std::vector paddings_;
+  std::vector dilations_;
+  phi::DataType dtype_;
+  int groups_;
+  int64_t data_layout_;
+};
+
+struct ConvCacheKeyHash {
+  size_t operator()(const ConvCacheKey& cache) const {
+    return cache.hash_value();
+  }
+};
+
+struct ConvCacheKeyEqual {
+  size_t operator()(const ConvCacheKey& first,
+                    const ConvCacheKey& second) const {
+    if (first.x_dims_ != second.x_dims_) return false;
+    if (first.w_dims_ != second.w_dims_) return false;
+    if (first.strides_ != second.strides_) return false;
+    if (first.paddings_ != second.paddings_) return false;
+    if (first.dilations_ != second.dilations_) return false;
+    if (first.dtype_ != second.dtype_) return false;
+    if (first.groups_ != second.groups_) return false;
+    if (first.data_layout_ != second.data_layout_) return false;
+
+    return true;
+  }
+};
+
+class CudnnAlgorithmsCacheMap {
+ public:
+  CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }
+
+  DnnNode Get(const ConvCacheKey& key) {
+    std::lock_guard lock(*cache_mutex_);
+    PADDLE_ENFORCE_NE(
+        hash_.find(key),
+        hash_.end(),
+        phi::errors::PreconditionNotMet("The key does not exist."));
+    return hash_[key];
+  }
+
+  bool Find(const ConvCacheKey& key) {
+    bool ret = false;
+    std::lock_guard lock(*cache_mutex_);
+    if (hash_.find(key) != hash_.end()) {
+      cache_hits_++;
+      ret = true;
+    } else {
+      cache_misses_++;
+    }
+    return ret;
+  }
+
+  void Clean() {
+    std::lock_guard lock(*cache_mutex_);
+    hash_.clear();
+    cache_hits_ = 0;
+    cache_misses_ = 0;
+  }
+
+  void Set(const ConvCacheKey& key, DnnNode algo) {
+    std::lock_guard lock(*cache_mutex_);
+    if (hash_.size() > static_cast(FLAGS_search_cache_max_number)) {
+      hash_.clear();
+    }
+    hash_[key] = algo;
+  }
+
+  int64_t CacheMisses() const { return cache_misses_; }
+
+  int64_t CacheHits() const { return cache_hits_; }
+
+  float CacheHitRate() const {
+    int64_t num_accesses = cache_hits_ + cache_misses_;
+    float cache_hit_rate = 0.;
+    if (num_accesses != 0) {
+      cache_hit_rate =
+          static_cast(cache_hits_) / static_cast(num_accesses);
+    }
+    return cache_hit_rate;
+  }
+
+  int64_t Size() const { return hash_.size(); }
+
+ private:
+  std::unordered_map
+      hash_;
+  std::shared_ptr cache_mutex_;
+
+  int64_t cache_hits_{0};
+  int64_t cache_misses_{0};
+};

 size_t TransposeKey(const std::vector& x_dims,
                     const std::vector& perm,
@@ -77,7 +205,7 @@ class AlgorithmsCache {
  public:
   AlgorithmsCache() : cache_mutex_(new std::mutex()) { hash_.clear(); }

-  AlgorithmT Get(size_t key) {
+  AlgorithmT Get(const size_t& key) {
     std::lock_guard lock(*cache_mutex_);
     PADDLE_ENFORCE_NE(
         hash_.find(key),
@@ -86,7 +214,7 @@ class AlgorithmsCache {
     return hash_[key];
   }

-  bool Find(size_t key) {
+  bool Find(const size_t& key) {
     bool ret = false;
     std::lock_guard lock(*cache_mutex_);
     if (hash_.find(key) != hash_.end()) {
@@ -105,7 +233,7 @@ class AlgorithmsCache {
     cache_misses_ = 0;
   }

-  void Set(size_t key, AlgorithmT algo) {
+  void Set(const size_t& key, AlgorithmT algo) {
     std::lock_guard lock(*cache_mutex_);
     hash_[key] = algo;
   }
@@ -143,9 +271,12 @@ enum class AlgorithmType {
 };

 // AlgorithmsConfigKey -> AlgorithmsID
+// TODO(hong): use cudnnConvolutionFwdAlgo_t
 using AlgorithmsCacheMap = AlgorithmsCache;
 // AlgorithmType -> AlgorithmsCache
 using AlgorithmsTypeMap = std::unordered_map;
+using CudnnAlgorithmsTypeMap =
+    std::unordered_map;

 class AutoTuneCache {
  public:
@@ -158,16 +289,19 @@ class AutoTuneCache {
     return auto_tune_map_[static_cast(algo_type)];
   }

-  AlgorithmsCacheMap& GetConvForward() {
-    return Get(AlgorithmType::kConvForward);
+  CudnnAlgorithmsCacheMap& GetConvForward() {
+    return cudnn_auto_tune_map_[static_cast(
+        AlgorithmType::kConvForward)];
   }

-  AlgorithmsCacheMap& GetConvBackwardData() {
-    return Get(AlgorithmType::kConvBackwardData);
+  CudnnAlgorithmsCacheMap& GetConvBackwardData() {
+    return cudnn_auto_tune_map_[static_cast(
+        AlgorithmType::kConvBackwardData)];
   }

-  AlgorithmsCacheMap& GetConvBackwardFilter() {
-    return Get(AlgorithmType::kConvBackwardFilter);
+  CudnnAlgorithmsCacheMap& GetConvBackwardFilter() {
+    return cudnn_auto_tune_map_[static_cast(
+        AlgorithmType::kConvBackwardFilter)];
   }

   AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
@@ -176,6 +310,10 @@ class AutoTuneCache {
     for (auto& v : auto_tune_map_) {
       v.second.Clean();
     }
+
+    for (auto& v : cudnn_auto_tune_map_) {
+      v.second.Clean();
+    }
   }

   void UpdateStatus();
@@ -206,14 +344,25 @@ class AutoTuneCache {

   void Register(const AlgorithmType& algo_type) {
     std::lock_guard lock(*autotune_cache_mutex_);
-    int64_t key = static_cast(algo_type);
-    if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
-      AlgorithmsCacheMap cache;
-      auto_tune_map_[key] = cache;
+    if (algo_type == AlgorithmType::kConvForward ||
+        algo_type == AlgorithmType::kConvBackwardData ||
+        algo_type == AlgorithmType::kConvBackwardFilter) {
+      int64_t key = static_cast(algo_type);
+      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
+        CudnnAlgorithmsCacheMap cache;
+        cudnn_auto_tune_map_[key] = cache;
+      }
+    } else {
+      int64_t key = static_cast(algo_type);
+      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
+        AlgorithmsCacheMap cache;
+        auto_tune_map_[key] = cache;
+      }
     }
   }

   AlgorithmsTypeMap auto_tune_map_;
+  CudnnAlgorithmsTypeMap cudnn_auto_tune_map_;
   std::shared_ptr autotune_cache_mutex_;
   int64_t total_cache_hits_{0};
   int64_t total_cache_misses_{0};

diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc
index 53574c3d0c..29affd45f0 100644
--- a/paddle/phi/kernels/autotune/cache_test.cc
+++ b/paddle/phi/kernels/autotune/cache_test.cc
@@ -34,20 +34,23 @@ TEST(AlgosCache, AlgosCache) {
   std::vector dilations = {1, 1};
   phi::DataType dtype = paddle::experimental::CppTypeToDataType::Type();

-  auto key = phi::autotune::ConvKey(
-      x_shape, w_shape, paddings, strides, dilations, dtype);
+  phi::autotune::ConvCacheKey key(
+      x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0);
   EXPECT_EQ(cache.Find(key), false);
-  cache.Set(key, ConvAlgos::GEMMKernel);
+  phi::autotune::DnnNode node(static_cast(ConvAlgos::GEMMKernel), 0);
+  cache.Set(key, node);
   EXPECT_EQ(cache.Size(), 1);
   EXPECT_EQ(cache.Find(key), true);
   auto algo = cache.Get(key);
-  EXPECT_EQ(algo, ConvAlgos::GEMMKernel);
+  EXPECT_EQ(algo.algo, ConvAlgos::GEMMKernel);

   x_shape = {4, 128, 128, 3};
-  key = phi::autotune::ConvKey(
-      x_shape, w_shape, paddings, strides, dilations, dtype);
-  EXPECT_EQ(cache.Find(key), false);
-  cache.Set(key, ConvAlgos::CuDNNKernel_1);
+  phi::autotune::ConvCacheKey key1(
+      x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1);
+  EXPECT_EQ(cache.Find(key1), false);
+  phi::autotune::DnnNode node1(static_cast(ConvAlgos::CuDNNKernel_1),
+                               0);
+  cache.Set(key1, node1);
   EXPECT_EQ(cache.Size(), 2);
   EXPECT_EQ(cache.CacheHits(), 1);
   EXPECT_EQ(cache.CacheMisses(), 2);

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
index ef70907b59..fb9580427e 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -254,6 +254,8 @@ void ConvCudnnGradGradKernel(
   auto dtype = paddle::platform::CudnnDataType::type;

   auto handle = ctx.cudnn_handle();
+  auto layout = paddle::platform::GetCudnnTensorFormat(
+      paddle::platform::DataLayout::kNCHW);

   paddle::operators::ConvArgs args1{&transformed_ddX,
                                     W,
@@ -261,28 +263,36 @@ void ConvCudnnGradGradKernel(
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args2{&transformed_X,
                                     ddW,
                                     &transformed_ddO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args3{&transformed_ddX,
                                     dW,
                                     &transformed_dO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};
   paddle::operators::ConvArgs args4{&transformed_dX,
                                     ddW,
                                     &transformed_dO_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    paddle::platform::DataLayout::kNCHW};

 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult fwd_result1;
@@ -298,9 +308,6 @@ void ConvCudnnGradGradKernel(
       filter_result;
 #endif

-  auto layout = paddle::platform::GetCudnnTensorFormat(
-      paddle::platform::DataLayout::kNCHW);
-
   // ddo = conv(ddI, W) + conv(I, ddW)
   size_t workspace_size = 0;

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 4e9c37879c..bc7a8b4f37 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -251,27 +251,33 @@ void ConvCudnnGradKernel(const Context& ctx,
   T* input_grad_data = nullptr;
   T* transformed_input_grad_data = nullptr;

+  paddle::platform::DataLayout layout =
+      compute_format == paddle::platform::DataLayout::kNHWC
+          ? paddle::platform::DataLayout::kNHWC
+          : paddle::platform::DataLayout::kNCHW;
+
   paddle::operators::ConvArgs args1{&transformed_input_grad,
                                     &transformed_filter_channel,
                                     &transformed_output_grad_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
   paddle::operators::ConvArgs args2{&transformed_input,
                                     &transformed_filter_grad_channel,
                                     &transformed_output_grad_channel,
                                     strides,
                                     padding_common,
                                     dilations,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};

   auto handle = ctx.cudnn_handle();
   // TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
-  paddle::platform::DataLayout layout =
-      compute_format == paddle::platform::DataLayout::kNHWC
-          ? paddle::platform::DataLayout::kNHWC
-          : paddle::platform::DataLayout::kNCHW;
+
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == paddle::platform::DataLayout::kNHWC
                  ? paddle::platform::DataLayout::kNDHWC
@@ -368,8 +374,7 @@ void ConvCudnnGradKernel(const Context& ctx,
     using search1 = paddle::operators::SearchAlgorithm;
     bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx);
-    workspace_size_d = std::max(
-        workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo));
+    workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
 #endif
   }
@@ -400,8 +405,7 @@ void ConvCudnnGradKernel(const Context& ctx,
         search2::Find(args2, exhaustive_search, deterministic, ctx);
     VLOG(3) << "filter algo: " << filter_result.algo << ", time "
             << filter_result.time;
-    workspace_size_w = std::max(
-        workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo));
+    workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
 #endif
   }

diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index bd95a32bc7..aa591a34a4 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -213,7 +213,9 @@ void ConvCudnnKernel(const Context& ctx,
                                    strides,
                                    padding_common,
                                    dilations,
-                                   dtype};
+                                   dtype,
+                                   groups,
+                                   compute_format};

   auto handle = ctx.cudnn_handle();
   auto workspace_handle = ctx.cudnn_workspace_handle();
@@ -314,7 +316,7 @@ void ConvCudnnKernel(const Context& ctx,
   using search = paddle::operators::SearchAlgorithm;
   fwd_result = search::Find(args, exhaustive_search, deterministic, ctx);
-  workspace_size = search::GetWorkspaceSize(args, fwd_result.algo);
+  workspace_size = fwd_result.workspace_size;
 #endif

 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)

diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
index 0ce16f66be..626ef79d56 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
@@ -179,14 +179,18 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};
   paddle::operators::ConvArgs args2{&transformed_dout,
                                     &filter,
                                     &x_transpose,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    layout};

 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult fwd_result;
@@ -625,6 +629,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
   auto dtype = paddle::platform::CudnnDataType::type;

   auto handle = ctx.cudnn_handle();
+  auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);

   paddle::operators::ConvArgs args1{&transformed_ddout_channel,
                                     &filter,
@@ -632,14 +637,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
   paddle::operators::ConvArgs args2{&transformed_ddout_channel,
                                     &ddfilter,
                                     &transformed_x,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};

   paddle::operators::ConvArgs args3{&transformed_dout,
                                     dfilter,
@@ -647,14 +656,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
   paddle::operators::ConvArgs args4{&transformed_dout,
                                     &ddfilter,
                                     &transformed_dx_channel,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    GPUDNNDataLayout::kNCHW};
 #ifdef PADDLE_WITH_HIP
   paddle::operators::SearchResult bwd_result1;
   paddle::operators::SearchResult bwd_result2;
@@ -669,8 +682,6 @@
   paddle::operators::SearchResult fwd_result;
 #endif

-  auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
-
   // ddo = conv(ddI, filter) + conv(I, ddfilter)
   size_t workspace_size = 0;

diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
index 58ead4c328..2289541da1 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
@@ -205,7 +205,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
                                     strides,
                                     padding_common,
                                     dilations_,
-                                    dtype};
+                                    dtype,
+                                    groups,
+                                    data_layout};
   args.handle = handle;
   args.idesc.set(transformed_out, iwo_groups);
   args.wdesc.set(filter, layout_tensor, iwo_groups);

diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py
index a22df61ace..3daf8752d8 100644
--- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py
+++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py
@@ -71,11 +71,8 @@ class TestAutoTune(unittest.TestCase):
         }
         if paddle.is_compiled_with_cuda():
             # Total 3 * num_iters cache accesses, only iter 2 hits the cache.
-            if enable_autotune and step_id >= 1:
-                expected_res["cache_size"] = 3
-            if enable_autotune and step_id == 2:
-                expected_res["cache_hit_rate"] = np.round(
-                    float(3) / float(9), 5)
+            expected_res["cache_size"] = 3
+            expected_res["cache_hit_rate"] = (step_id + 0.0) / (step_id + 1.0)
         return expected_res

     def test_autotune(self):
@@ -91,11 +88,11 @@ class TestAutoTune(unittest.TestCase):
     def check_status(self, expected_res):
         status = paddle.fluid.core.autotune_status()
         for key in status.keys():
+            v = status[key]
             if key == "cache_hit_rate":
-                v = np.round(status[key], 5)
+                self.assertTrue(np.allclose(v, expected_res[key]))
             else:
-                v = status[key]
-            self.assertEqual(v, expected_res[key])
+                self.assertEqual(v, expected_res[key])


 class TestDygraphAutoTuneStatus(TestAutoTune):
--
GitLab
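Editor's note: the standalone sketch below is not part of the patch. It models the caching pattern the patch introduces: a structured key compared field by field instead of a raw size_t hash (so hash collisions cannot return the wrong algorithm), and a cached record that carries both the chosen algorithm and its workspace size so the workspace query is skipped on a cache hit, with a crude size cap like FLAGS_search_cache_max_number. All names here (SimpleConvKey, SimpleDnnNode, SimpleAlgoCache, kCacheMaxSize) are hypothetical stand-ins for ConvCacheKey, DnnNode, CudnnAlgorithmsCacheMap, and the flag; this is an illustration, not Paddle's actual API.

// Simplified model of the keyed conv-algorithm cache described in the patch.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <unordered_map>
#include <vector>

// Fold one value into a running hash, in the same spirit as HashCombine above.
static void HashCombine(std::size_t* seed, std::int64_t v) {
  std::hash<std::int64_t> hasher;
  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
  *seed *= 0x00000100000001B3;  // FNV-style multiply to spread the bits
}

// Structured key: kept whole so equality is exact, not hash-based.
struct SimpleConvKey {
  std::vector<std::int64_t> x_dims, w_dims;
  int groups = 1;
  std::int64_t data_layout = 0;

  bool operator==(const SimpleConvKey& o) const {
    return x_dims == o.x_dims && w_dims == o.w_dims && groups == o.groups &&
           data_layout == o.data_layout;
  }
};

struct SimpleConvKeyHash {
  std::size_t operator()(const SimpleConvKey& k) const {
    std::size_t seed = 0xcbf29ce484222325;  // FNV offset basis
    for (auto v : k.x_dims) HashCombine(&seed, v);
    for (auto v : k.w_dims) HashCombine(&seed, v);
    HashCombine(&seed, k.groups);
    HashCombine(&seed, k.data_layout);
    return seed;
  }
};

// Cached record: the algorithm id plus the workspace size measured when it
// was chosen, so later lookups need not recompute the workspace size.
struct SimpleDnnNode {
  std::int64_t algo = 0;
  std::size_t workspace_size = 0;
};

class SimpleAlgoCache {
 public:
  bool Find(const SimpleConvKey& key, SimpleDnnNode* out) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = map_.find(key);
    if (it == map_.end()) return false;
    *out = it->second;
    return true;
  }

  void Set(const SimpleConvKey& key, const SimpleDnnNode& node) {
    std::lock_guard<std::mutex> guard(mutex_);
    if (map_.size() > kCacheMaxSize) map_.clear();  // crude growth cap
    map_[key] = node;
  }

 private:
  static constexpr std::size_t kCacheMaxSize = 1000000;
  std::unordered_map<SimpleConvKey, SimpleDnnNode, SimpleConvKeyHash> map_;
  std::mutex mutex_;
};

int main() {
  SimpleAlgoCache cache;
  SimpleConvKey key{{1, 3, 224, 224}, {64, 3, 7, 7}, 1, 0};
  SimpleDnnNode node;
  if (!cache.Find(key, &node)) {
    // Pretend an exhaustive/heuristic search picked algo 1 with a 64 MB workspace.
    node = {1, 64u << 20};
    cache.Set(key, node);
  }
  std::cout << "algo=" << node.algo << " workspace=" << node.workspace_size
            << "\n";
}

The design point being illustrated: storing the workspace size alongside the algorithm is what lets the kernels above replace GetWorkspaceSize(args, result.algo) with result.workspace_size, and the full-field equality check is what makes the extra groups and data-layout fields in the key meaningful.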