diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index da326ff7d76d71679d056ed7a3e89dd44c3fd6a8..53ac895bfbccb94783ddf5ff0ddb4e81a07f8afb 100644
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   add_subdirectory(pylayer)
   cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
   add_dependencies(grad_tensor_holder eager_final_state_codegen)
-  cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
+  cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
 endif()
 
 cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 3d8a5ab21f00fcc4137d177b741023a827e325d7..69cd45222cef4985cff120c24a0025fa2a754366 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper)
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper)
-cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
-cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
+cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune)
+cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune)
 cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
   if(WITH_NCCL OR WITH_RCCL)
diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h
index c664d1935fe2e33819c93178f510d2547fcc4954..9e1a323fc9f3d67e3346dfaa255a6f6c09223747 100644
--- a/paddle/fluid/operators/conv_base_helper.h
+++ b/paddle/fluid/operators/conv_base_helper.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/conv_search_cache.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/autotune/cache.h"
 
 namespace paddle {
 namespace operators {
@@ -41,12 +42,22 @@ struct SearchAlgorithm {};
 
 // As the container of searchAlgorithm::Find() result.
 template <typename AlgoT>
 struct SearchResult {
-  public:
+  SearchResult() {}
+  explicit SearchResult(AlgoT a) : algo(a) {}
+
   AlgoT algo = static_cast<AlgoT>(0);
   float time = -1.f;
   size_t workspace_size = 0;
 };
 
+template <typename T>
+static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+  out << "[";
+  for (auto const& tmp : v) out << tmp << ",";
+  out << "]";
+  return out;
+}
+
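As context for reviewers, the relocated operator<< overload and the extended SearchResult container can be exercised in isolation. A minimal standalone sketch, not part of the patch (plain int stands in for the cuDNN algorithm enums):

#include <cstddef>
#include <iostream>
#include <vector>

// Same printing pattern as above: any std::vector<T> renders as "[a,b,c,]".
template <typename T>
std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
  out << "[";
  for (auto const& tmp : v) out << tmp << ",";
  out << "]";
  return out;
}

// Same shape as SearchResult<AlgoT>: default algo 0, unmeasured time -1.
template <typename AlgoT>
struct SearchResult {
  SearchResult() {}
  explicit SearchResult(AlgoT a) : algo(a) {}

  AlgoT algo = static_cast<AlgoT>(0);
  float time = -1.f;
  size_t workspace_size = 0;
};

int main() {
  std::vector<int> strides = {2, 2};
  std::cout << "strides=" << strides << std::endl;  // prints strides=[2,2,]
  SearchResult<int> result(1);
  std::cout << "algo=" << result.algo << ", time=" << result.time << " ms"
            << std::endl;
  return 0;
}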
 // As the container of conv relevant descriptors.
 template <typename DataT>
 struct ConvArgsBase {
@@ -68,6 +79,17 @@ struct ConvArgsBase {
                const framework::Tensor* o, const std::vector<int> s,
                const std::vector<int> p, const std::vector<int> d, DataT dtype)
       : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
+
+  template <typename T>
+  size_t GetCacheKey() const {
+    auto x_shape = phi::vectorize(x->dims());
+    auto w_shape = phi::vectorize(w->dims());
+    VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
+             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
+    return phi::autotune::ConvKey(
+        x_shape, w_shape, p, s, d,
+        paddle::experimental::CppTypeToDataType<T>::Type());
+  }
 };
 
 static inline void GetNCDHW(const framework::DDim& dims,
@@ -87,13 +109,5 @@ static inline void GetNCDHW(const framework::DDim& dims,
   }
 }
 
-template <typename T>
-static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
-  out << "[";
-  for (auto const& tmp : v) out << tmp << ",";
-  out << "]";
-  return out;
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 1311f812be118b2b7bfce55e9b95beece7c48dea..419fb8a4ca7030b0b45e058403e45c55f6c0f3ba 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_base_helper.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
 namespace paddle {
@@ -67,20 +69,16 @@ static inline bool UseFixedWorkspace() {
   return FLAGS_conv_workspace_size_limit >= 0;
 }
 
-static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) {
-  if (!UseFixedWorkspace()) {
+static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
+  if (!use_fixed_workspace) {
     int device_id = platform::GetCurrentDeviceId();
     int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id);
     int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id);
     int64_t availble = platform::GpuAvailableMemToAlloc();
-    int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize();
     VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
             << " MB, reserved=" << ToMegaBytes(reserved)
-            << " MB, available_to_alloc=" << ToMegaBytes(availble)
-            << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size)
-            << " MB.";
-    return std::max(std::max(availble, cur_workspace_size),
-                    reserved - allocated);
+            << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
+    return std::max(availble, reserved - allocated);
   } else {
     return FLAGS_conv_workspace_size_limit * 1024 * 1024;
   }
@@ -104,26 +102,44 @@ std::string GetPerfResultString(std::string prefix,
   return out.str();
 }
 
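GetCacheKey() above reduces a convolution configuration to a single size_t via phi::autotune::ConvKey. One plausible standalone model of such a key, boost-style hash combining over shapes, paddings, strides, dilations, and dtype, is sketched below (the real ConvKey implementation may differ in detail):

#include <cstddef>
#include <functional>
#include <vector>

// Boost-style hash combination.
static void HashCombine(std::size_t* seed, std::size_t value) {
  *seed ^= value + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

static std::size_t HashVector(const std::vector<int>& v) {
  std::size_t seed = v.size();
  for (int x : v) HashCombine(&seed, std::hash<int>()(x));
  return seed;
}

// Mirrors the ConvKey(x_dims, w_dims, paddings, strides, dilations, dtype)
// signature used above; illustrative only.
std::size_t ConvKeySketch(const std::vector<int>& x_dims,
                          const std::vector<int>& w_dims,
                          const std::vector<int>& paddings,
                          const std::vector<int>& strides,
                          const std::vector<int>& dilations,
                          int dtype) {
  std::size_t seed = 0;
  HashCombine(&seed, HashVector(x_dims));
  HashCombine(&seed, HashVector(w_dims));
  HashCombine(&seed, HashVector(paddings));
  HashCombine(&seed, HashVector(strides));
  HashCombine(&seed, HashVector(dilations));
  HashCombine(&seed, static_cast<std::size_t>(dtype));
  return seed;
}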
+// Choose an algorithm with the minimal time cost and lower memory cost.
+// NOTE: perf_results is ordered by time.
 template <typename PerfT, typename AlgoT>
 void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                            size_t workspace_limit,
-                           SearchResult<AlgoT>* algo_result) {
+                           SearchResult<AlgoT>* search_result) {
+  int best_algo_idx = -1;
   for (size_t i = 0; i < perf_results.size(); ++i) {
     auto result = perf_results[i];
     if (result.status == CUDNN_STATUS_SUCCESS &&
         result.memory < workspace_limit) {
-      algo_result->algo = result.algo;
-      algo_result->time = result.time;
-      algo_result->workspace_size = result.memory;
-      VLOG(3) << "  algo=" << result.algo << ", time=" << result.time
-              << " ms, memory=" << ToMegaBytes(result.memory)
-              << " MB (limit=" << ToMegaBytes(workspace_limit)
-              << " MB), status=" << result.status;
-      return;
+      if (best_algo_idx == -1) {
+        // The fastest algorithm whose workspace size satisfies the
+        // workspace_limit constraint.
+        best_algo_idx = i;
+        // In heuristic search, each perf_results[i].time is set to -1.
+        if (perf_results[best_algo_idx].time < 0) {
+          break;
+        }
+      } else {
+        float best_algo_time = perf_results[best_algo_idx].time;
+        if ((result.time - best_algo_time) / best_algo_time < 0.01) {
+          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
+                              ? i
+                              : best_algo_idx;
+          break;
+        }
+      }
     }
   }
-  VLOG(3) << "Can not find an algorithm that requires memory < "
-          << ToMegaBytes(workspace_limit) << " MB";
+  if (best_algo_idx != -1) {
+    search_result->algo = perf_results[best_algo_idx].algo;
+    search_result->time = perf_results[best_algo_idx].time;
+    search_result->workspace_size = perf_results[best_algo_idx].memory;
+  } else {
+    VLOG(3) << "Can not find an algorithm that requires memory < "
+            << ToMegaBytes(workspace_limit) << " MB";
+  }
 }
 
 static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
@@ -151,6 +167,10 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
 #endif
 }
 
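The selection rule in ChooseAlgoByWorkspace above can be isolated as follows: take the fastest successful result that fits the workspace limit, but upgrade to a later result when it is within 1% of the best time and needs less memory. A standalone sketch, assuming a mock perf type:

#include <cstddef>
#include <vector>

struct MockPerf {   // stands in for cudnnConvolutionFwdAlgoPerf_t
  int algo;
  int status;       // 0 == success
  float time;       // milliseconds; -1 in heuristic mode
  size_t memory;    // bytes
};

// Returns the index of the chosen result, or -1 when nothing fits.
int ChooseByWorkspaceSketch(const std::vector<MockPerf>& results,
                            size_t workspace_limit) {
  int best = -1;
  for (size_t i = 0; i < results.size(); ++i) {
    const auto& r = results[i];
    if (r.status != 0 || r.memory >= workspace_limit) continue;
    if (best == -1) {
      best = static_cast<int>(i);
      // Heuristic mode reports no times, so the first fit is final.
      if (results[best].time < 0) break;
    } else {
      float best_time = results[best].time;
      if ((r.time - best_time) / best_time < 0.01) {
        if (r.memory < results[best].memory) best = static_cast<int>(i);
        break;
      }
    }
  }
  return best;
}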
+// cuDNN convolution forward-algorithm searcher, consisting of three search
+// modes: deterministic, heuristic, and exhaustive_search. It also provides
+// a workspace-size acquisition function for the chosen algorithm.
 template <>
 struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   using PerfT = cudnnConvolutionFwdAlgoPerf_t;
@@ -162,90 +182,30 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
                                   const phi::GPUContext& ctx) {
     SearchResult<AlgoT> result;
     auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
     SetConvMathType(ctx, dtype, args.cdesc);
 
-    if (!exhaustive_search && !deterministic) {
-#if CUDNN_VERSION >= 7001
-      int actual_perf_count;
-      int best_algo_idx = 0;
-      std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-              args.handle, args.idesc.desc(), args.wdesc.desc(),
-              args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
-              &actual_perf_count, perf_results.data()));
-      result.algo = perf_results[best_algo_idx].algo;
-      result.workspace_size = perf_results[best_algo_idx].memory;
-
-      if (result.workspace_size > workspace_size_limit) {
-#if CUDNN_VERSION >= 8000
-        // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
-        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
-                                            &result);
-#else
-        VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
-                   "becasue the workspace size request("
-                << result.workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-                args.handle, args.idesc.desc(), args.wdesc.desc(),
-                args.cdesc.desc(), args.odesc.desc(),
-                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &(result.algo)));
-#endif
-      }
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-              args.handle, args.idesc.desc(), args.wdesc.desc(),
-              args.cdesc.desc(), args.odesc.desc(),
-              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &(result.algo)));
-#endif
-    } else if (deterministic) {
-      result.algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
+    if (deterministic) {
+      result = FindAlgoDeterministic();
     } else {
-      auto workspace_handle = ctx.cudnn_workspace_handle();
-      auto x_dims = phi::vectorize(args.x->dims());
-      auto w_dims = phi::vectorize(args.w->dims());
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
-               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-               << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-      AlgorithmsCache<AlgoT>& algo_cache =
-          *(framework::ConvSearchCache::Instance().GetForward());
-
-      result.algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0,
-          static_cast<int64_t>(args.cudnn_dtype), [&]() {
-            int returned_algo_count;
-            std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
-            size_t max_workspace_size =
-                FindMaxWorkspaceSize(args, workspace_size_limit);
-            VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
-                    << " MB";
-
-            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      args.handle, args.idesc.desc(), args.x->data<T>(),
-                      args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                      args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                      kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      perf_results.data(), cudnn_workspace_ptr,
-                      max_workspace_size));
-            };
-            workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
-                                         UseFixedWorkspace());
-
-            VLOG(4) << GetPerfResultString<PerfT>(
-                "[Exhaustive Search] FwdAlgo Perf result", perf_results,
-                returned_algo_count, workspace_size_limit);
-            result.time = perf_results[0].time;
-            return perf_results[0].algo;
-          });
+      // 1. Once the exhaustive-search FLAG is turned on, exhaustive search
+      //    is always performed.
+      // 2. Once auto-tune is turned on, heuristic search (the default) runs
+      //    before the auto-tune process, and exhaustive search runs during it.
+      // 3. After the auto-tune process, the cached algorithm is used when
+      //    available; otherwise the default (heuristic) mode runs.
+      size_t key = args.GetCacheKey<T>();
+      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
+      if (cache.Find(key)) {
+        result.algo = static_cast<AlgoT>(cache.Get(key));
+      } else {
+        bool use_autotune =
+            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
+        if (exhaustive_search || use_autotune) {
+          result = FindAlgoExhaustiveSearch<T>(args, ctx);
+          cache.Set(key, static_cast<int64_t>(result.algo));
+        } else {
+          result = FindAlgoHeuristic(args, ctx);
+        }
+      }
     }
     VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
@@ -265,8 +225,95 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   }
 
  private:
-  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
-                                     size_t workspace_size_limit) {
+  static SearchResult<AlgoT> FindAlgoDeterministic() {
+    return SearchResult<AlgoT>(static_cast<AlgoT>(1));
+  }
+
+  // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
+  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
+                                               const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+
+#if CUDNN_VERSION >= 7001
+    int actual_perf_count;
+    int best_algo_idx = 0;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
+            args.handle, args.idesc.desc(), args.wdesc.desc(),
+            args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
+            &actual_perf_count, perf_results.data()));
+    result.algo = perf_results[best_algo_idx].algo;
+    result.workspace_size = perf_results[best_algo_idx].memory;
+
+    if (result.workspace_size > workspace_size_limit) {
+#if CUDNN_VERSION >= 8000
+      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                          &result);
+#else
+      VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
+                 "because the workspace size request("
+              << result.workspace_size << ") exceeds the limit("
+              << workspace_size_limit << ")";
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(),
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &(result.algo)));
+#endif
+    }
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+            args.handle, args.idesc.desc(), args.wdesc.desc(),
+            args.cdesc.desc(), args.odesc.desc(),
+            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
+            &(result.algo)));
+#endif
+    return result;
+  }
+
+  template <typename T>
+  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
+      const ConvArgs& args, const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
+    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
+            << " MB";
+
+    int returned_algo_count;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
+    auto cudnn_find_func = [&](void* workspace_ptr) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+              args.handle, args.idesc.desc(), args.x->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
+              kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_results.data(),
+              workspace_ptr, max_workspace_size));
+    };
+
+    auto workspace_handle = ctx.cudnn_workspace_handle();
+    workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
+                                 UseFixedWorkspace());
+
+    VLOG(4) << GetPerfResultString<PerfT>(
+        "[Exhaustive Search] FwdAlgo Perf result", perf_results,
+        returned_algo_count, workspace_size_limit);
+    ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                        &result);
+
+    return result;
+  }
+
+  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
+                                    size_t workspace_size_limit) {
     if (!UseFixedWorkspace()) {
       size_t max_workspace_size = 0;
       for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
@@ -288,6 +335,12 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   }
 };
 
+// cuDNN convolution backward data-algorithm searcher, consisting of three
+// search modes: deterministic, heuristic, and exhaustive_search. It also
+// provides a workspace-size acquisition function for the chosen algorithm.
 template <>
 struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
@@ -299,102 +352,31 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                                   const phi::GPUContext& ctx) {
     SearchResult<AlgoT> result;
     auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
     SetConvMathType(ctx, dtype, args.cdesc);
 
-    if (!exhaustive_search && !deterministic) {
-#if CUDNN_VERSION >= 7001
-      int actual_perf_count;
-      int best_algo_idx = 0;
-      std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
-              args.handle, args.wdesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
-              &actual_perf_count, perf_results.data()));
-      result.algo = perf_results[best_algo_idx].algo;
-
-#if CUDNN_VERSION < 7500
-      int stride_dim = args.x->dims().size() - 2;
-      bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim,
-                                   [=](int n) { return n != 1; });
-      if (blacklist && (perf_results[best_algo_idx].algo ==
-                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
-                        perf_results[best_algo_idx].algo ==
-                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
-        result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      }
-#endif
-      result.workspace_size = GetWorkspaceSize(args, result.algo);
-      if (result.workspace_size > workspace_size_limit) {
-#if CUDNN_VERSION >= 8000
-        // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
-        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
-                                            &result);
-#else
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << result.workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-                args.handle, args.wdesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.idesc.desc(),
-                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &(result.algo)));
-#endif
-      }
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              args.handle, args.wdesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.idesc.desc(),
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &(result.algo)));
-#endif
-    } else if (deterministic) {
-      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+    if (deterministic) {
+      result = FindAlgoDeterministic();
     } else {
-      auto workspace_handle = ctx.cudnn_workspace_handle();
-      auto x_dims = phi::vectorize(args.x->dims());
-      auto w_dims = phi::vectorize(args.w->dims());
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t"
-               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-               << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-      AlgorithmsCache<AlgoT>& algo_cache =
-          *(framework::ConvSearchCache::Instance().GetBackwardData());
-      result.algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0,
-          static_cast<int64_t>(args.cudnn_dtype), [&]() {
-            int returned_algo_count;
-            std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
-            size_t max_workspace_size =
-                FindMaxWorkspaceSize(args, workspace_size_limit);
-            VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
-                    << " MB";
-
-            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::
-                      cudnnFindConvolutionBackwardDataAlgorithmEx(
-                          args.handle, args.wdesc.desc(), args.w->data<T>(),
-                          args.odesc.desc(), args.o->data<T>(),
-                          args.cdesc.desc(), args.idesc.desc(),
-                          const_cast<T*>(args.x->data<T>()),
-                          kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                          perf_results.data(), cudnn_workspace_ptr,
-                          max_workspace_size));
-            };
-            workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
-                                         UseFixedWorkspace());
-
-            VLOG(3) << GetPerfResultString<PerfT>(
-                "[Exhaustive Search] BwdDataAlgo Perf result", perf_results,
-                returned_algo_count, workspace_size_limit);
-            result.time = perf_results[0].time;
-            return perf_results[0].algo;
-          });
+      // 1. Once the exhaustive-search FLAG is turned on, exhaustive search
+      //    is always performed.
+      // 2. Once auto-tune is turned on, heuristic search (the default) runs
+      //    before the auto-tune process, and exhaustive search runs during it.
+      // 3. After the auto-tune process, the cached algorithm is used when
+      //    available; otherwise the default (heuristic) mode runs. The
+      //    dispatch below implements this policy.
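The three numbered cases above reduce to a small dispatch. A simplified single-threaded model (illustrative only; the real cache is the mutex-protected phi::autotune::AlgorithmsCache, and the algorithm numbers here are placeholders):

#include <cstddef>
#include <cstdint>
#include <unordered_map>

std::unordered_map<std::size_t, int64_t> g_cache;  // key -> algorithm id

int64_t FindSketch(std::size_t key, bool deterministic, bool exhaustive_search,
                   bool use_autotune) {
  if (deterministic) return 1;                      // fixed, reproducible algo
  auto it = g_cache.find(key);
  if (it != g_cache.end()) return it->second;       // cache hit
  int64_t algo;
  if (exhaustive_search || use_autotune) {
    algo = /*FindAlgoExhaustiveSearch()*/ 2;        // measured, then cached
    g_cache[key] = algo;
  } else {
    algo = /*FindAlgoHeuristic()*/ 3;               // cheap default, not cached
  }
  return algo;
}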
+      size_t key = args.GetCacheKey<T>();
+      auto& cache =
+          phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
+      if (cache.Find(key)) {
+        result.algo = static_cast<AlgoT>(cache.Get(key));
+      } else {
+        bool use_autotune =
+            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
+        if (exhaustive_search || use_autotune) {
+          result = FindAlgoExhaustiveSearch<T>(args, ctx);
+          cache.Set(key, static_cast<int64_t>(result.algo));
+        } else {
+          result = FindAlgoHeuristic(args, ctx);
+        }
+      }
     }
     VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
             << ", deterministic=" << deterministic
@@ -414,8 +396,106 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   }
 
  private:
-  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
-                                     size_t workspace_size_limit) {
+  static SearchResult<AlgoT> FindAlgoDeterministic() {
+    return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
+  }
+
+  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
+                                               const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+
+#if CUDNN_VERSION >= 7001
+    int actual_perf_count;
+    int best_algo_idx = 0;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
+            args.handle, args.wdesc.desc(), args.odesc.desc(),
+            args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
+            &actual_perf_count, perf_results.data()));
+    result.algo = perf_results[best_algo_idx].algo;
+
+#if CUDNN_VERSION < 7500
+    int stride_dim = args.x->dims().size() - 2;
+    bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim,
+                                 [=](int n) { return n != 1; });
+    if (blacklist && (perf_results[best_algo_idx].algo ==
+                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
+                      perf_results[best_algo_idx].algo ==
+                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
+      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+    }
+#endif
+    result.workspace_size = GetWorkspaceSize(args, result.algo);
+    if (result.workspace_size > workspace_size_limit) {
+#if CUDNN_VERSION >= 8000
+      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                          &result);
+#else
+      VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
+                 "the workspace size request("
+              << result.workspace_size << ") exceeds the limit("
+              << workspace_size_limit << ")";
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              args.handle, args.wdesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.idesc.desc(),
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &(result.algo)));
+#endif
+    }
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+            args.handle, args.wdesc.desc(), args.odesc.desc(),
+            args.cdesc.desc(), args.idesc.desc(),
+            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+            workspace_size_limit, &(result.algo)));
+#endif
+
+    return result;
+  }
+
+  template <typename T>
+  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
+      const ConvArgs& args, const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
+    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
+            << " MB";
+
+    int returned_algo_count;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
+    auto cudnn_find_func = [&](void* workspace_ptr) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
+              args.handle, args.wdesc.desc(), args.w->data<T>(),
+              args.odesc.desc(), args.o->data<T>(), args.cdesc.desc(),
+              args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
+              kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
+              perf_results.data(), workspace_ptr, max_workspace_size));
+    };
+
+    auto workspace_handle = ctx.cudnn_workspace_handle();
+    workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
+                                 UseFixedWorkspace());
+
+    VLOG(4) << GetPerfResultString<PerfT>(
+        "[Exhaustive Search] BwdDataAlgo Perf result", perf_results,
+        returned_algo_count, workspace_size_limit);
+    ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                        &result);
+
+    return result;
+  }
+
+  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
+                                    size_t workspace_size_limit) {
     if (!UseFixedWorkspace()) {
       size_t max_workspace_size = 0;
       for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
@@ -438,6 +518,10 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   }
 };
 
+// cuDNN convolution backward filter-algorithm searcher, consisting of three
+// search modes: deterministic, heuristic, and exhaustive_search. Specially,
+// the exhaustive search has two patterns: one for HALF precision only, and
+// one for the rest. It also provides a workspace-size acquisition function
+// for the chosen algorithm.
 template <>
 struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
@@ -450,113 +534,30 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     platform::CUDAGraphCaptureModeGuard guard;
     SearchResult<AlgoT> result;
     auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
     SetConvMathType(ctx, dtype, args.cdesc);
 
-    if (!exhaustive_search && !deterministic) {
-#if CUDNN_VERSION >= 7001
-      int actual_perf_count;
-      int best_algo_idx = 0;
-      std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
-              args.handle, args.idesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
-              &actual_perf_count, perf_results.data()));
-      result.algo = perf_results[best_algo_idx].algo;
-      result.workspace_size = perf_results[best_algo_idx].memory;
-
-      if (result.workspace_size > workspace_size_limit) {
-#if CUDNN_VERSION >= 8000
-        // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
-        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
-                                            &result);
-#else
-        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                   "the workspace size request("
-                << result.workspace_size << ") exceeds the limit("
-                << workspace_size_limit << ")";
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-                args.handle, args.idesc.desc(), args.odesc.desc(),
-                args.cdesc.desc(), args.wdesc.desc(),
-                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-                workspace_size_limit, &(result.algo)));
-#endif
-      }
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              args.handle, args.idesc.desc(), args.odesc.desc(),
-              args.cdesc.desc(), args.wdesc.desc(),
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &(result.algo)));
-#endif
-    } else if (deterministic) {
-      result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+    if (deterministic) {
+      result = FindAlgoDeterministic();
     } else {
-      auto workspace_handle = ctx.cudnn_workspace_handle();
-      auto x_dims = phi::vectorize(args.x->dims());
-      auto w_dims = phi::vectorize(args.w->dims());
-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
-               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-               << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-      AlgorithmsCache<AlgoT>& algo_cache =
-          *(framework::ConvSearchCache::Instance().GetBackwardFilter());
-
-      if (dtype != CUDNN_DATA_HALF) {
-        result.algo = algo_cache.GetAlgorithm(
-            x_dims, w_dims, args.s, args.p, args.d, 0,
-            static_cast<int64_t>(args.cudnn_dtype), [&]() {
-              int returned_algo_count;
-              std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
-              size_t max_workspace_size =
-                  FindMaxWorkspaceSize(args, workspace_size_limit);
-              VLOG(3) << "max_workspace_size="
-                      << ToMegaBytes(max_workspace_size) << " MB";
-
-              auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                            args.handle, args.idesc.desc(), args.x->data<T>(),
-                            args.odesc.desc(), args.o->data<T>(),
-                            args.cdesc.desc(), args.wdesc.desc(),
-                            const_cast<T*>(args.w->data<T>()),
-                            kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                            perf_results.data(), cudnn_workspace_ptr,
-                            max_workspace_size));
-              };
-              workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
-                                           UseFixedWorkspace());
-
-              VLOG(3) << GetPerfResultString<PerfT>(
-                  "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
-                  returned_algo_count, workspace_size_limit);
-              result.time = perf_results[0].time;
-              return perf_results[0].algo;
-            });
+      // 1. Once the exhaustive-search FLAG is turned on, exhaustive search
+      //    is always performed.
+      // 2. Once auto-tune is turned on, heuristic search (the default) runs
+      //    before the auto-tune process, and exhaustive search runs during it.
+      // 3. After the auto-tune process, the cached algorithm is used when
+      //    available; otherwise the default (heuristic) mode runs.
+      size_t key = args.GetCacheKey<T>();
+      auto& cache =
+          phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
+      if (cache.Find(key)) {
+        result.algo = static_cast<AlgoT>(cache.Get(key));
       } else {
-        result.algo = algo_cache.GetAlgorithm(
-            x_dims, w_dims, args.s, args.p, args.d, 0,
-            static_cast<int64_t>(args.cudnn_dtype), [&]() {
-              SearchResult<AlgoT> algo_result;
-              int actual_algos = 0;
-              std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
-
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::
-                      cudnnFindConvolutionBackwardFilterAlgorithm(
-                          args.handle, args.idesc.desc(), args.odesc.desc(),
-                          args.cdesc.desc(), args.wdesc.desc(),
-                          perf_results.size(), &actual_algos,
-                          perf_results.data()));
-              perf_results.resize(actual_algos);
-              ChooseAlgo(perf_results, workspace_size_limit, &algo_result);
-              result.time = algo_result.time;
-              return algo_result.algo;
-            });
+        bool use_autotune =
+            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
+        if (exhaustive_search || use_autotune) {
+          result = FindAlgoExhaustiveSearch<T>(args, ctx);
+          cache.Set(key, static_cast<int64_t>(result.algo));
+        } else {
+          result = FindAlgoHeuristic(args, ctx);
+        }
       }
     }
     VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
@@ -578,8 +579,126 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   }
 
  private:
-  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
-                                     size_t workspace_size_limit) {
+  static SearchResult<AlgoT> FindAlgoDeterministic() {
+    return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
+  }
+
+  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
+                                               const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+
+#if CUDNN_VERSION >= 7001
+    int actual_perf_count;
+    int best_algo_idx = 0;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+            args.handle, args.idesc.desc(), args.odesc.desc(),
+            args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
+            &actual_perf_count, perf_results.data()));
+    result.algo = perf_results[best_algo_idx].algo;
+    result.workspace_size = perf_results[best_algo_idx].memory;
+
+    if (result.workspace_size > workspace_size_limit) {
+#if CUDNN_VERSION >= 8000
+      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                          &result);
+#else
+      VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
+                 "the workspace size request("
+              << result.workspace_size << ") exceeds the limit("
+              << workspace_size_limit << ")";
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+              args.handle, args.idesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.wdesc.desc(),
+              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &(result.algo)));
+#endif
+    }
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+            args.handle, args.idesc.desc(), args.odesc.desc(),
+            args.cdesc.desc(), args.wdesc.desc(),
+            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+            workspace_size_limit, &(result.algo)));
+#endif
+
+    return result;
+  }
+
+  template <typename T>
+  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
+      const ConvArgs& args, const phi::GPUContext& ctx) {
+    SearchResult<AlgoT> result;
+    int returned_algo_count = 0;
+    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
+    size_t workspace_size_limit =
+        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
+    auto workspace_handle = ctx.cudnn_workspace_handle();
+    if (platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
+      size_t max_workspace_size =
+          GetMaxWorkspaceSize(args, workspace_size_limit);
+      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
+              << " MB";
+
+      auto cudnn_find_func = [&](void* workspace_ptr) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                args.handle, args.idesc.desc(), args.x->data<T>(),
+                args.odesc.desc(), args.o->data<T>(), args.cdesc.desc(),
+                args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
+                kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
+                perf_results.data(), workspace_ptr, max_workspace_size));
+      };
+      workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
+                                   UseFixedWorkspace());
+
+      VLOG(4) << GetPerfResultString<PerfT>(
+          "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
+          returned_algo_count, workspace_size_limit);
+      ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
+                                          &result);
+    } else {
+      int max_algos = GetAlgorithmMaxCount(args.handle);
+      std::vector<PerfT> perf_results(max_algos);
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
+              args.handle, args.idesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.wdesc.desc(), perf_results.size(),
+              &returned_algo_count, perf_results.data()));
+      perf_results.resize(returned_algo_count);
+
+      VLOG(4) << GetPerfResultString<PerfT>(
+          "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
+          perf_results.size(), workspace_size_limit);
+      ChooseAlgo(perf_results, workspace_size_limit, &result);
+    }
+
+    return result;
+  }
+
+  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    int max_algos = 0;
+    auto status =
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + handle, &max_algos); + if (status == gpuSuccess) { + VLOG(5) << "[BackwardFilter] max_algos: predefined=" + << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos; + return max_algos; + } +#endif + return kNUM_CUDNN_BWD_FILTER_ALGS; + } + + static size_t GetMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { if (!UseFixedWorkspace()) { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { @@ -604,10 +723,6 @@ struct SearchAlgorithm { static void ChooseAlgo(const std::vector& perf_results, size_t workspace_limit, SearchResult* algo_result) { - VLOG(3) << GetPerfResultString( - "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, - perf_results.size(), workspace_limit); - for (size_t i = 0; i != perf_results.size(); ++i) { const auto& result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 18b53563cd64eb6807f8198ff2d2cef67cf5d4b5..a43eaa41cfe83c2ca635fca93c091020ed43dc38 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -774,3 +774,12 @@ DEFINE_bool(enable_ins_parser_file, false, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif + +/** + * Autotune related FLAG + * Name: FLAGS_use_autotune + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + */ +PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 982bf7646125b853b45708500825a35e8966ec85..45fcd2fad98a851e151375c8cb03ba5e006ee6b6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4469,7 +4469,7 @@ All parameter, weight, gradient are variables in Paddle. return phi::autotune::AutoTuneStatus::Instance().DisableAutoTune(); }); - m.def("autotune_range", [](int64_t start, int64_t stop) { + m.def("set_autotune_range", [](int64_t start, int64_t stop) { return phi::autotune::AutoTuneStatus::Instance().SetAutoTuneRange(start, stop); }); @@ -4478,10 +4478,8 @@ All parameter, weight, gradient are variables in Paddle. [] { return phi::autotune::AutoTuneStatus::Instance().Update(); }); m.def("autotune_status", [] { - phi::autotune::AutoTuneCache::Instance().UpdateStatus(); py::dict res; - res["use_autotune"] = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + phi::autotune::AutoTuneCache::Instance().UpdateStatus(); res["step_id"] = phi::autotune::AutoTuneStatus::Instance().StepID(); res["cache_size"] = phi::autotune::AutoTuneCache::Instance().Size(); res["cache_hit_rate"] = diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 937024d450a36368fa9d4d41369260e9244330a8..eec83a1ed8130c51cd633267035b7d228df0a1ce 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h # phi functors and functions called by kernels add_subdirectory(funcs) +# kernel autotune +add_subdirectory(autotune) + # phi depends all phi kernel targets set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. 
Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor ) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,12 +30,16 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel +set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel) +set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) +foreach(src ${AUTOTUNE_KERNELS}) + kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) +endforeach() kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) @@ -75,6 +82,3 @@ add_subdirectory(selected_rows) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) # For strings kernels add_subdirectory(strings) - -# 5. 
kernel autotune -add_subdirectory(autotune) diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index f1702d883b9f03bb217fb58c0b093efce206c2d8..63dc22459446f6cd7a29c8866d840177c767b334 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,11 +1,12 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) - hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) endif() cc_library(cache SRCS cache.cc DEPS boost) +cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags) cc_test(cache_test SRCS cache_test.cc DEPS gtest cache) diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index bf68e2010151b28b6a0afab5078520ec8df4a31e..ef2cbe633d4965d37a23c790ad605607c76e12ae 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" +#include +#include "glog/logging.h" namespace phi { namespace autotune { @@ -32,5 +34,40 @@ size_t ConvKey(const std::vector& x_dims, static_cast(dtype)); } +std::string AlgorithmTypeString(int64_t algo_type) { + if (algo_type == static_cast(AlgorithmType::kConvForward)) { + return "conv_forward"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardData)) { + return "conv_backward_data"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardFilter)) { + return "conv_backward_filter"; + } + return std::to_string(algo_type); +} + +void AutoTuneCache::UpdateStatus() { + int64_t size = 0; + int64_t cache_hits = 0; + int64_t cache_misses = 0; + int name_width = 24; + std::cout.setf(std::ios::left); + for (auto& v : auto_tune_map_) { + VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } + total_size_ = size; + total_cache_hits_ = cache_hits; + total_cache_misses_ = cache_misses; +} + } // namespace autotune } // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index d492e7c151f916173b6e952619412c81de23c3ff..37c5d134e8a61344996379741248366bce96d72c 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -13,11 +13,12 @@ // limitations under the License. 
#pragma once + #include #include +#include #include #include -#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -92,6 +93,13 @@ class AlgorithmsCache { return ret; } + void Clean() { + std::lock_guard lock(*cache_mutex_); + hash_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + void Set(size_t key, AlgorithmT algo) { std::lock_guard lock(*cache_mutex_); hash_[key] = algo; @@ -116,15 +124,22 @@ class AlgorithmsCache { private: std::unordered_map hash_; std::shared_ptr cache_mutex_; - int64_t cache_hits_ = 0; - int64_t cache_misses_ = 0; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; + +enum class AlgorithmType { + kConvForward = 1, + kConvBackwardData = 2, + kConvBackwardFilter = 3, + kAlgorithmCount = 4 }; // AlgorithmsConfigKey -> AlgorithmsID -using AlgorithmsConfigKeyMap = AlgorithmsCache; -// AlgorithmsType -> AlgorithmsCache -using AlgorithmsTypeMap = - std::unordered_map; +using AlgorithmsCacheMap = AlgorithmsCache; +// AlgorithmType -> AlgorithmsCache +using AlgorithmsTypeMap = std::unordered_map; class AutoTuneCache { public: @@ -133,42 +148,30 @@ class AutoTuneCache { return autotune_cache; } - AlgorithmsConfigKeyMap& RegisterOrGet(const std::string& algo_type) { - std::lock_guard lock(*autotune_cache_mutex_); - if (auto_tune_map_.find(algo_type) == auto_tune_map_.end()) { - AlgorithmsConfigKeyMap cache; - auto_tune_map_[algo_type] = cache; - } - return auto_tune_map_[algo_type]; + AlgorithmsCacheMap& Get(const AlgorithmType& algo_type) { + return auto_tune_map_[static_cast(algo_type)]; } - void Clean(float miss_rate) { - std::lock_guard lock(*autotune_cache_mutex_); - // Set a small tolerance to avoid performance degradation - // due to large cache size under dynamic shape. 
- if (miss_rate > 0.01) { - auto_tune_map_.clear(); - } + AlgorithmsCacheMap& GetConvForward() { + return Get(AlgorithmType::kConvForward); + } + + AlgorithmsCacheMap& GetConvBackwardData() { + return Get(AlgorithmType::kConvBackwardData); + } + + AlgorithmsCacheMap& GetConvBackwardFilter() { + return Get(AlgorithmType::kConvBackwardFilter); } - void UpdateStatus() { - int64_t size = 0; - int64_t cache_hits = 0; - int64_t cache_misses = 0; + void Clean() { for (auto& v : auto_tune_map_) { - VLOG(4) << "AlgoType: " << v.first << " Cache Size: " << v.second.Size() - << " Hits: " << v.second.CacheHits() - << " Misses: " << v.second.CacheMisses() - << " Hit Rate: " << v.second.CacheHitRate(); - size += v.second.Size(); - cache_hits += v.second.CacheHits(); - cache_misses += v.second.CacheMisses(); + v.second.Clean(); } - total_size_ = size; - total_cache_hits_ = cache_hits; - total_cache_misses_ = cache_misses; } + void UpdateStatus(); + // The number of total config cached int64_t Size() const { return total_size_; } @@ -183,17 +186,30 @@ class AutoTuneCache { total_cache_hit_rate = static_cast(total_cache_hits_) / static_cast(total_num_accesses); } - return total_cache_hit_rate; } private: - AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {} + AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) { + for (int i = 1; i < static_cast(AlgorithmType::kAlgorithmCount); ++i) { + Register(static_cast(i)); + } + } + + void Register(const AlgorithmType& algo_type) { + std::lock_guard lock(*autotune_cache_mutex_); + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + AlgorithmsCacheMap cache; + auto_tune_map_[key] = cache; + } + } + AlgorithmsTypeMap auto_tune_map_; std::shared_ptr autotune_cache_mutex_; - int64_t total_cache_hits_ = 0; - int64_t total_cache_misses_ = 0; - int64_t total_size_ = 0; + int64_t total_cache_hits_{0}; + int64_t total_cache_misses_{0}; + int64_t total_size_{0}; }; } // namespace autotune diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 92ba411624fc0b1b6bd1237ae8f9b446032249b8..f99f8bfc8b821204b24e70f14805a394d6121300 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.RegisterOrGet("conv_fw"); + auto& cache = autotune_cache.GetConvForward(); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc new file mode 100644 index 0000000000000000000000000000000000000000..6fda24ef3c8600afeadb820ea46b32101d914e26 --- /dev/null +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/autotune/switch_autotune.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+DECLARE_bool(use_autotune);
+
+namespace phi {
+namespace autotune {
+
+void AutoTuneStatus::EnableAutoTune() {
+  FLAGS_use_autotune = true;
+  Init();
+}
+
+void AutoTuneStatus::DisableAutoTune() {
+  FLAGS_use_autotune = false;
+  Init();
+}
+
+void AutoTuneStatus::Update() {
+  current_steps_id_ += 1;
+
+  if (!FLAGS_use_autotune) {
+    return;
+  }
+
+  // This function is called when each iteration finishes.
+  if (current_steps_id_ + 1 < start_step_id_) {
+    use_autotune_ = false;
+  } else if (current_steps_id_ + 1 >= start_step_id_ &&
+             current_steps_id_ + 1 < stop_step_id_) {
+    use_autotune_ = true;
+    AutoTuneCache::Instance().UpdateStatus();
+    step_hit_rates_.push_back(StepHitRate());
+    VLOG(3) << "Step ID: " << current_steps_id_
+            << ", Accumulative Cache Hit Rate: "
+            << static_cast<int>(AutoTuneCache::Instance().CacheHitRate() * 100)
+            << "%, Cache Size: " << AutoTuneCache::Instance().Size()
+            << ", Current Step Hit Rate: "
+            << static_cast<int>(StepHitRate() * 100) << "%";
  } else {
+    use_autotune_ = false;
+    // Set a small tolerance to avoid performance degradation
+    // due to large cache size under dynamic shape.
+    // TODO(limingshu): Currently this works for the conv op only; this
+    // method shall be optimized when more ops are involved.
+    // float miss_rate = static_cast<float>(1) - RecentHitRate();
+    // if (current_steps_id_ == stop_step_id_) {
+    //   AutoTuneCache::Instance().Clean(miss_rate);
+    // }
+    if (VLOG_IS_ON(4)) {
+      AutoTuneCache::Instance().UpdateStatus();
+      VLOG(4) << "Step ID: " << current_steps_id_ << ", Current Step Hit Rate: "
+              << static_cast<int>(StepHitRate() * 100) << "%";
+    }
+  }
+}
+
+}  // namespace autotune
+}  // namespace phi
diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h
index 2f9621ed2079e497f389efd1dd74f46a255fe842..1793940542d4790ee6a5d020bf7deae22a50bb02 100644
--- a/paddle/phi/kernels/autotune/switch_autotune.h
+++ b/paddle/phi/kernels/autotune/switch_autotune.h
@@ -13,10 +13,8 @@
 // limitations under the License.
 
 #pragma once
+
 #include
-#include
-#include
-#include "glog/logging.h"
 #include "paddle/phi/kernels/autotune/cache.h"
 
 namespace phi {
@@ -31,45 +29,11 @@ class AutoTuneStatus {
 
   bool UseAutoTune() { return use_autotune_; }
 
-  // EnableAutoTune and DisableAutoTune Should be used for debug only.
-  void EnableAutoTune() {
-    use_autotune_ = true;
-    Init();
-  }
-
-  void DisableAutoTune() {
-    use_autotune_ = false;
-    Init();
-  }
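Update() above implements a half-open tuning window [start_step_id_, stop_step_id_) with an off-by-one subtlety: it runs at the end of a step, so the `+ 1` decides the behavior of the next step. A minimal standalone model of just that state machine (member names shortened; hit-rate bookkeeping omitted):

#include <cstdint>

struct TuneWindow {
  int64_t start = 1, stop = 10, step = -1;
  bool use_autotune = false;

  // Called once at the end of every training step, mirroring Update().
  void Update(bool flag_use_autotune) {
    step += 1;
    if (!flag_use_autotune) return;
    // step + 1 is the upcoming step: tune only inside [start, stop).
    use_autotune = (step + 1 >= start) && (step + 1 < stop);
  }
};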
+  // EnableAutoTune and DisableAutoTune should be used for debug only.
+  void EnableAutoTune();
+  void DisableAutoTune();
 
-  void Update() {
-    current_steps_id_ += 1;
-
-    if (!use_autotune_ && !update_use_autotune_) {
-      return;
-    }
-
-    if (current_steps_id_ < start_step_id_) {
-      use_autotune_ = false;
-    } else if (current_steps_id_ >= start_step_id_ &&
-               current_steps_id_ < stop_step_id_) {
-      use_autotune_ = true;
-      AutoTuneCache::Instance().UpdateStatus();
-      step_hit_rates_.push_back(StepHitRate());
-      VLOG(3) << "Step ID " << current_steps_id_
-              << ", Accumulative Cache Hit Rate: "
-              << AutoTuneCache::Instance().CacheHitRate()
-              << ", Cache Size: " << AutoTuneCache::Instance().Size()
-              << ", Current Step Hit Rate: " << StepHitRate();
-    } else if (current_steps_id_ == stop_step_id_) {
-      use_autotune_ = false;
-      update_use_autotune_ = false;
-      // clean cache according miss rate
-      float miss_rate = static_cast<float>(1) - RecentHitRate();
-      AutoTuneCache::Instance().Clean(miss_rate);
-      VLOG(3) << "Recent Miss Rate: " << miss_rate;
-    }
-  }
+  void Update();
 
   int64_t StepID() { return current_steps_id_; }
 
@@ -84,19 +48,25 @@ class AutoTuneStatus {
 
   // Hit Rate of Current Step
   float StepHitRate() {
-    int64_t current_hits = AutoTuneCache::Instance().CacheHits();
-    int64_t current_misses = AutoTuneCache::Instance().CacheMisses();
-    int64_t step_hits_ = current_hits - previous_hits_;
-    int64_t step_misses_ = current_misses - previous_misses_;
-    float step_hit_rate = 0.;
-    int64_t step_num_accesses = step_hits_ + step_misses_;
-    if (step_num_accesses != 0) {
-      step_hit_rate = static_cast<float>(step_hits_) /
-                      static_cast<float>(step_num_accesses);
+    static int64_t last_step_id = -2;
+
+    if (last_step_id != current_steps_id_) {
+      int64_t current_hits = AutoTuneCache::Instance().CacheHits();
+      int64_t current_misses = AutoTuneCache::Instance().CacheMisses();
+      int64_t step_hits_ = current_hits - previous_hits_;
+      int64_t step_misses_ = current_misses - previous_misses_;
+      float step_hit_rate = 0.;
+      int64_t step_num_accesses = step_hits_ + step_misses_;
+      if (step_num_accesses != 0) {
+        step_hit_rate = static_cast<float>(step_hits_) /
+                        static_cast<float>(step_num_accesses);
+      }
+      previous_hits_ = current_hits;
+      previous_misses_ = current_misses;
+      current_step_hit_rate_ = step_hit_rate;
+      last_step_id = current_steps_id_;
     }
-    previous_hits_ = current_hits;
-    previous_misses_ = current_misses;
-    return step_hit_rate;
+    return current_step_hit_rate_;
   }
 
   void SetAutoTuneRange(int64_t start, int64_t stop) {
@@ -108,21 +78,21 @@ class AutoTuneStatus {
   AutoTuneStatus() = default;
 
   void Init() {
-    update_use_autotune_ = use_autotune_;
+    use_autotune_ = false;
     current_steps_id_ = -1;
     previous_hits_ = 0;
     previous_misses_ = 0;
     step_hit_rates_.clear();
-    AutoTuneCache::Instance().Clean(1.0);
+    AutoTuneCache::Instance().Clean();
   }
 
-  int64_t start_step_id_ = 0;
-  int64_t stop_step_id_ = 10;
-  int64_t current_steps_id_ = -1;
-  bool use_autotune_ = false;
-  bool update_use_autotune_ = false;
-  int64_t previous_hits_ = 0;
-  int64_t previous_misses_ = 0;
+  bool use_autotune_{false};
+  int64_t start_step_id_{1};
+  int64_t stop_step_id_{10};
+  int64_t current_steps_id_{-1};
+  int64_t previous_hits_{0};
+  int64_t previous_misses_{0};
+  float current_step_hit_rate_{0.f};
   std::vector<float> step_hit_rates_;
 };
diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py
index 1c08811d4b95c53870042d455b222b49cd2742b1..1775272aac69d0acc4cdc1128f3a94da796dbc2b 100644
--- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py
+++ 
b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -14,7 +14,7 @@ import paddle import unittest -import numpy +import numpy as np class SimpleNet(paddle.nn.Layer): @@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer): def train_dygraph(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam(parameters=net.parameters()) @@ -36,6 +37,7 @@ def train_dygraph(net, data): def static_program(net, data): + data.stop_gradient = False out = net(data) loss = paddle.mean(out) adam = paddle.optimizer.Adam() @@ -43,60 +45,64 @@ def static_program(net, data): return loss -def set_flags(enable_autotune): - if paddle.is_compiled_with_cuda(): - if enable_autotune: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) - else: - paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) - paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) - - class TestAutoTune(unittest.TestCase): + def set_flags(self, enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + + def get_flags(self, name): + res = paddle.get_flags(name) + return res[name] + + def get_expected_res(self, step_id, enable_autotune): + expected_res = { + "step_id": step_id, + "cache_size": 0, + "cache_hit_rate": 0 + } + if paddle.is_compiled_with_cuda(): + # Total 3 * num_iters cache accesses, only iter 2 hits the cache. + if enable_autotune and step_id >= 1: + expected_res["cache_size"] = 3 + if enable_autotune and step_id == 2: + expected_res["cache_hit_rate"] = np.round( + float(3) / float(9), 5) + return expected_res + def test_autotune(self): paddle.fluid.core.disable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], False) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), False) paddle.fluid.core.enable_autotune() - status = paddle.fluid.core.autotune_status() - self.assertEqual(status["use_autotune"], True) + self.assertEqual(self.get_flags("FLAGS_use_autotune"), True) def check_status(self, expected_res): status = paddle.fluid.core.autotune_status() for key in status.keys(): - self.assertEqual(status[key], expected_res[key]) + if key == "cache_hit_rate": + v = np.round(status[key], 5) + else: + v = status[key] + self.assertEqual(v, expected_res[key]) class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): - set_flags(enable_autotune) + self.set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) + paddle.fluid.core.set_autotune_range(1, 2) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) 
net = SimpleNet() for i in range(3): train_dygraph(net, x_var) - if i >= 1 and i < 2: - expected_res = { - "step_id": i, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) def func_enable_autotune(self): self.run_program(enable_autotune=True) @@ -118,60 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune): class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() - set_flags(enable_autotune) - if enable_autotune: - paddle.fluid.core.enable_autotune() - else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.autotune_range(1, 2) data_shape = [1, 1, 8, 8] - data = paddle.static.data(name='X', shape=data_shape, dtype='float32') - net = SimpleNet() - loss = static_program(net, data) + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=data_shape, dtype='float32') + net = SimpleNet() + loss = static_program(net, data) place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda( ) else paddle.CPUPlace() exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - x = numpy.random.random(size=data_shape).astype('float32') + exe.run(startup_program) + x = np.random.random(size=data_shape).astype('float32') + + self.set_flags(enable_autotune) + if enable_autotune: + paddle.fluid.core.enable_autotune() + else: + paddle.fluid.core.disable_autotune() + paddle.fluid.core.set_autotune_range(1, 2) for i in range(3): - exe.run(feed={'X': x}, fetch_list=[loss]) + exe.run(program=main_program, feed={'X': x}, fetch_list=[loss]) status = paddle.fluid.core.autotune_status() - # In static mode, the startup_program will run at first. - # The expected step_id will be increased by 1. - if i >= 0 and i < 1: - expected_res = { - "step_id": i + 1, - "use_autotune": enable_autotune, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) - else: - expected_res = { - "step_id": i + 1, - "use_autotune": False, - "cache_size": 0, - "cache_hit_rate": 0 - } - self.check_status(expected_res) + expected_res = self.get_expected_res(i, enable_autotune) + self.check_status(expected_res) paddle.disable_static() def func_enable_autotune(self): self.run_program(enable_autotune=True) def test_enable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_enable_autotune() self.func_enable_autotune() def func_disable_autotune(self): self.run_program(enable_autotune=False) def test_disable_autotune(self): - with paddle.fluid.framework._test_eager_guard(): - self.func_disable_autotune() self.func_disable_autotune()
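
Taken together, the controls exercised by these tests compose into the following minimal usage sketch (dygraph mode; SimpleNet and train_dygraph as defined in this test file):

import paddle

paddle.fluid.core.enable_autotune()          # sets FLAGS_use_autotune=True
paddle.fluid.core.set_autotune_range(1, 2)   # tune during steps [1, 2)

net = SimpleNet()
x = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.)
for i in range(3):
    train_dygraph(net, x)
    status = paddle.fluid.core.autotune_status()
    print(status["step_id"], status["cache_size"], status["cache_hit_rate"])

paddle.fluid.core.disable_autotune()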