support Exhaustive search in dygraph (#23415)

* use global conv cache; test=develop
* use singleton cache; test=develop
* fix format error; test=develop
* add cudnn helper header; test=develop
* fix header error; test=develop
* fix mac unit test; test=develop
* fix mac unit test; test=develop
* fix file format; test=develop
* fix include file error, test=develop
* remove kernel_configs_ in class ExecutionContext and kernel_configs_map_ in class OperatorWithKernel, test=develop
* fix test_elementwise_mul_op_dim, test=develop
* fix compile error, test=develop

Co-authored-by: N phlrain <phliuhongyu@126.com>

support Exhaustive search in dygraph (#23415)
* use global conv cache; test=develop
* use singleton cache; test=develop
* fix format error; test=develop
* add cudnn helper header; test=develop
* fix header error; test=develop
* fix mac unit test; test=develop
* fix mac unit test; test=develop
* fix file format; test=develop
* fix include file error, test=develop
* remove kernel_configs_ in class ExecutionContext and kernel_configs_map_ in class OperatorWithKernel, test=develop
* fix test_elementwise_mul_op_dim, test=develop
* fix compile error, test=develop

Co-authored-by: N phlrain <phliuhongyu@126.com>
dbfbd7ea · zhongpu · GitHub · 01d7ccd4 · dbfbd7ea · dbfbd7ea
16 changed files
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -905,16 +905,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
  this->InferShape(&infer_shape_ctx);
 }

-std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
-    const OpKernelType& key) const {
-  auto config_iter = kernel_configs_map_.find(key);
-  std::vector<KernelConfig>* kernel_configs = nullptr;
-  if (config_iter != kernel_configs_map_.end()) {
-    kernel_configs = &(config_iter->second);
-  }
-  return kernel_configs;
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -951,8 +941,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    ChooseKernel(*runtime_ctx, scope, place);
  }

-  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
-
  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
  Scope* transfer_scope = nullptr;
@@ -988,8 +976,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  {
    platform::RecordEvent record_event("compute",
                                       platform::EventRole::kInnerOp);
-    (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
-                                     kernel_configs));
+    (*kernel_func_)(
+        ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
  }

  if (!transfered_inplace_vars.empty()) {
@@ -1058,7 +1046,7 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
  OpKernelMap& kernels = kernels_iter->second;

  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, ctx));
  if (HasAttr("op_device")) {
    if (Attr<std::string>("op_device") == "cpu") {
      expected_kernel_key.place_ = platform::CPUPlace();

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -31,7 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -216,30 +215,12 @@ class OperatorBase {
                       const platform::Place& place) const = 0;
 };

-#ifdef PADDLE_WITH_CUDA
-using KernelConfig = boost::variant<
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>,
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>,
-    std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>>;
-#else
-using KernelConfig = boost::variant<boost::blank>;
-#endif
-
-using OpKernelConfigsMap =
-    std::unordered_map<OpKernelType, std::vector<KernelConfig>,
-                       OpKernelType::Hash>;
-
 class ExecutionContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
                   const platform::DeviceContext& device_context,
-                   const RuntimeContext& ctx,
-                   std::vector<KernelConfig>* configs)
-      : op_(op),
-        scope_(scope),
-        device_context_(device_context),
-        ctx_(ctx),
-        kernel_configs_(configs) {}
+                   const RuntimeContext& ctx)
+      : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
  virtual ~ExecutionContext() {}

  virtual std::string InputName(const std::string& name) const {
@@ -405,15 +386,6 @@ class ExecutionContext {
    return temp_tensor;
  }

-  template <typename T>
-  T& GetKernelConfig(size_t idx) const {
-    PADDLE_ENFORCE(
-        kernel_configs_ && kernel_configs_->size() > static_cast<size_t>(idx),
-        "%s selected kernel doesn't have kernel config %lu <= %lu",
-        op_.Type().c_str(), kernel_configs_->size(), idx);
-    return *boost::get<std::shared_ptr<T>>((*kernel_configs_)[idx]);
-  }
-
  const RuntimeContext Context() const { return ctx_; }

  std::string DebugString() const { return op_.DebugString(); }
@@ -423,7 +395,6 @@ class ExecutionContext {
  const Scope& scope_;
  const platform::DeviceContext& device_context_;
  const RuntimeContext& ctx_;
-  mutable std::vector<KernelConfig>* kernel_configs_;
 };

 template <>
@@ -499,8 +470,6 @@ class OperatorWithKernel : public OperatorBase {

  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;

-  std::vector<KernelConfig>* GetKernelConfig(const OpKernelType& key) const;
-
  // change this to public so that in dygraph mode we can call it to check if we
  // need transform data
  virtual OpKernelType GetKernelTypeForVar(
@@ -537,7 +506,6 @@ class OperatorWithKernel : public OperatorBase {
                    const platform::Place& place) const;

 protected:
-  mutable OpKernelConfigsMap kernel_configs_map_;
  mutable std::unique_ptr<OpKernelType> kernel_type_;
  mutable std::unique_ptr<OpKernelFunc> kernel_func_;
  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;

--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -21,19 +21,21 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-// Not thread-safe. Should be owned per-kernel.
+// thread-safe.
 template <typename TAlgorithm>
 class AlgorithmsCache {
 public:
  AlgorithmsCache() : search_times_(0) { hash_.clear(); }
  // Caches the best algorithm for a given
  // combination of tensor dimensions & compute data type.
-  TAlgorithm GetAlgorithm(
-      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const std::vector<int>& dilations,
-      int algorithmFlags,  // can set for different data type
-      std::function<TAlgorithm()> gen_func);
+  // cudnn_dtype set for different data type
+  TAlgorithm GetAlgorithm(const std::vector<int64_t>& dims1,
+                          const std::vector<int64_t>& dims2,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::vector<int>& dilations, int algorithmFlags,
+                          int64_t cudnn_dtype,
+                          std::function<TAlgorithm()> gen_func);

  TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
                          std::function<TAlgorithm()> gen_func);
@@ -41,13 +43,14 @@ class AlgorithmsCache {
 private:
  std::unordered_map<int64_t, TAlgorithm> hash_;
  int search_times_;
+  std::mutex cache_mutex;
 };

 template <typename TAlgorithm>
 TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
    const std::vector<int>& strides, const std::vector<int>& paddings,
-    const std::vector<int>& dilations, int algorithmFlags,
+    const std::vector<int>& dilations, int algorithmFlags, int64_t cudnn_dtype,
    std::function<TAlgorithm()> gen_func) {
  int64_t seed = 0;
  // Hash all of the inputs, use to try and look up a previously
@@ -81,36 +84,73 @@ TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
  seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
          (seed << 6) + (seed >> 2) + 5;

+  seed ^= hashFn(static_cast<int64_t>(cudnn_dtype)) + 0x9e3779b9 + (seed << 6) +
+          (seed >> 2) + 6;
+
  VLOG(10) << "seed:" << seed << ", hash_.size:" << hash_.size();

  if (seed == 0) return gen_func();

-  if (hash_.find(seed) == hash_.end()) {
-    TAlgorithm value = gen_func();
-    hash_[seed] = value;
+  TAlgorithm ret;
+  auto it = hash_.end();
+  bool have_found = false;
+  {
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    it = hash_.find(seed);
+
+    if (it != hash_.end()) {
+      ret = it->second;
+      have_found = true;
+    }
+  }
+
+  if (!have_found) {
+    ret = gen_func();
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    hash_[seed] = ret;
  }
-  return hash_[seed];
+
+  return ret;
 }

 template <typename TAlgorithm>
 TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
    int64_t area, int search_times, int algorithmFlags,
    std::function<TAlgorithm()> gen_func) {
-  if (hash_.find(area) != hash_.end()) {
-    return hash_[area];
+  auto it = hash_.end();
+  {
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    it = hash_.find(area);
+
+    if (it != hash_.end()) {
+      return it->second;
+    }
+  }
+
+  bool gene_flag = false;
+
+  {
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    gene_flag = (search_times_ < search_times);
  }
-  if (search_times_ < search_times) {
-    auto algo = gen_func();
+
+  TAlgorithm algo{};
+  if (gene_flag) {
+    algo = gen_func();
+    std::lock_guard<std::mutex> lock(cache_mutex);
    hash_[area] = algo;
    ++search_times_;
    return algo;
  }
-  TAlgorithm algo{};
+
  int64_t min = static_cast<uint64_t>(INT_MAX);
-  for (const auto& m : hash_) {
-    if (m.first < min) {
-      min = m.first;
-      algo = m.second;
+  {
+    std::lock_guard<std::mutex> lock(cache_mutex);
+    for (const auto& m : hash_) {
+      if (m.first < min) {
+        min = m.first;
+        algo = m.second;
+      }
    }
  }
  return algo;

--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -525,7 +525,7 @@ TEST(ExecutionContextAttrAndInOut, new_api) {

  paddle::framework::RuntimeContext ctx({}, {});
  paddle::framework::ExecutionContext exe_context(*(op.get()), scope, *dev_ctx,
-                                                  ctx, nullptr);
+                                                  ctx);

  ASSERT_EQ(exe_context.InputSize("input"), 1u);
  ASSERT_EQ(exe_context.OutputSize("output"), 1u);

--- a/paddle/fluid/imperative/execution_context.h
+++ b/paddle/fluid/imperative/execution_context.h
@@ -33,11 +33,10 @@ class DygraphExecutionContext : public framework::ExecutionContext {
                          const framework::Scope& scope,
                          const platform::DeviceContext& device_context,
                          const framework::RuntimeContext& ctx,
-                          std::vector<framework::KernelConfig>* configs,
                          const NameVarMap<VarType>& var_base_map_in,
                          const NameVarMap<VarType>& var_base_map_out,
                          const framework::AttributeMap& attrs)
-      : ExecutionContext(op, scope, device_context, ctx, configs),
+      : ExecutionContext(op, scope, device_context, ctx),
        var_base_map_in_(var_base_map_in),
        var_base_map_out_(var_base_map_out),
        attrs_(attrs) {}

--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -80,13 +80,8 @@ void PreparedOp::PrepareData(
 PreparedOp::PreparedOp(const framework::OperatorBase& op,
                       const framework::RuntimeContext& ctx,
                       const framework::OperatorWithKernel::OpKernelFunc& func,
-                       platform::DeviceContext* dev_ctx,
-                       std::vector<framework::KernelConfig>* kernel_configs)
-    : op_(op),
-      ctx_(ctx),
-      func_(func),
-      dev_ctx_(dev_ctx),
-      kernel_configs_(kernel_configs) {}
+                       platform::DeviceContext* dev_ctx)
+    : op_(op), ctx_(ctx), func_(func), dev_ctx_(dev_ctx) {}

 template <typename VarType>
 PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
@@ -111,7 +106,7 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
  framework::RuntimeContext ctx({}, {});
  auto expected_kernel_key =
      op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
-          op, framework::Scope(), *dev_ctx, ctx, nullptr, ins, outs, attrs));
+          op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

  auto kernel_iter = kernels.find(expected_kernel_key);
@@ -120,8 +115,6 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
    PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
                 KernelTypeToString(expected_kernel_key));
  }
-  std::vector<framework::KernelConfig>* kernel_configs =
-      op.GetKernelConfig(expected_kernel_key);

  if (!(expected_kernel_key.place_ == place)) {
    dev_ctx = pool.Get(expected_kernel_key.place_);
@@ -129,7 +122,7 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
  }

  PrepareDataImpl<VarType>(place, ins, op, expected_kernel_key);
-  return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
+  return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
 }

 PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
@@ -152,10 +145,8 @@ template <typename VarType>
 static void PreparedOpRunImpl(
    const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
    const framework::OperatorWithKernel::OpKernelFunc& func,
-    platform::DeviceContext* dev_ctx,
-    std::vector<framework::KernelConfig>* kernel_configs,
-    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
-    const framework::AttributeMap& attrs) {
+    platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs) {
  // TODO(zjl): remove scope in dygraph
  framework::Scope scope;

@@ -163,22 +154,21 @@ static void PreparedOpRunImpl(
  static_cast<const framework::OperatorWithKernel&>(op).InferShape(
      &infer_shape_ctx);

-  func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx,
-                                        kernel_configs, ins, outs, attrs));
+  func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
+                                        attrs));
 }

 void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                     const NameVarMap<VarBase>& outs,
                     const framework::AttributeMap& attrs) {
-  PreparedOpRunImpl<VarBase>(op_, ctx_, func_, dev_ctx_, kernel_configs_, ins,
-                             outs, attrs);
+  PreparedOpRunImpl<VarBase>(op_, ctx_, func_, dev_ctx_, ins, outs, attrs);
 }

 void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                     const NameVarMap<VariableWrapper>& outs,
                     const framework::AttributeMap& attrs) {
-  PreparedOpRunImpl<VariableWrapper>(op_, ctx_, func_, dev_ctx_,
-                                     kernel_configs_, ins, outs, attrs);
+  PreparedOpRunImpl<VariableWrapper>(op_, ctx_, func_, dev_ctx_, ins, outs,
+                                     attrs);
 }

 }  // namespace imperative

--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -33,8 +33,7 @@ class PreparedOp {
  PreparedOp(const framework::OperatorBase& op,
             const framework::RuntimeContext& ctx,
             const framework::OperatorWithKernel::OpKernelFunc& func,
-             platform::DeviceContext* dev_ctx,
-             std::vector<framework::KernelConfig>* kernel_configs);
+             platform::DeviceContext* dev_ctx);

  static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                            const NameVarMap<VarBase>& outs,
@@ -72,7 +71,6 @@ class PreparedOp {
  const framework::RuntimeContext& ctx_;
  framework::OperatorWithKernel::OpKernelFunc func_;
  platform::DeviceContext* dev_ctx_;
-  std::vector<framework::KernelConfig>* kernel_configs_;
 };

 }  // namespace imperative

--- a/paddle/fluid/imperative/tests/test_layer.cc
+++ b/paddle/fluid/imperative/tests/test_layer.cc
@@ -235,7 +235,7 @@ TEST(test_layer, test_dygraph_execution_context) {
  framework::Scope scope;

  DygraphExecutionContext<imperative::VarBase> dy_exe_context(
-      *(op.get()), scope, *dev_ctx, ctx, nullptr, ins, outs, concat_att_map);
+      *(op.get()), scope, *dev_ctx, ctx, ins, outs, concat_att_map);

  ASSERT_EQ(dy_exe_context.InputSize("X"), 1u);
  ASSERT_EQ(dy_exe_context.InputName("X"), "vin");

--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
    auto& dev_ctx = *pool.Get(dev_place);

    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
-    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);

    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");

--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_desc.h"
+// #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace operators {

@@ -89,7 +90,43 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
  return out;
 }

-using framework::AlgorithmsCache;
+// ConvSearchCache using framework::AlgorithmsCache to search
+// cudnnConvolutionFwdAlgo_t, cudnnConvolutionBwdDataAlgo_t or
+// cudnnConvolutionBwdFilterAlgo_t
+class ConvSearchCache {
+ public:
+  static ConvSearchCache& Instance() {
+    static ConvSearchCache instance;
+    return instance;
+  }
+
+  framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
+    return &forward_cache_;
+  }
+  framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* GetBackwardData() {
+    return &backward_data_cache_;
+  }
+  framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>*
+  GetBackwardFilter() {
+    return &backward_filter_cache_;
+  }
+  framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
+    return &fusion_forward_cache_;
+  }
+
+ private:
+  ConvSearchCache() {}
+  ~ConvSearchCache() {}
+  ConvSearchCache(const ConvSearchCache&) {}
+  ConvSearchCache& operator=(const ConvSearchCache&) {}
+
+  framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
+  framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>
+      backward_data_cache_;
+  framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>
+      backward_filter_cache_;
+  framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+};

 struct ConvArgs {
  cudnnHandle_t handle;
@@ -97,6 +134,7 @@ struct ConvArgs {
  platform::FilterDescriptor wdesc;
  platform::ConvolutionDescriptor cdesc;
  const framework::Tensor *x, *w, *o;
+  cudnnDataType_t cudnn_dtype;

  // strides
  std::vector<int> s;
@@ -107,8 +145,9 @@ struct ConvArgs {

  ConvArgs(const framework::Tensor* x, const framework::Tensor* w,
           const framework::Tensor* o, const std::vector<int> s,
-           const std::vector<int> p, const std::vector<int> d)
-      : x(x), w(w), o(o), s(s), p(p), d(d) {}
+           const std::vector<int> p, const std::vector<int> d,
+           cudnnDataType_t dtype)
+      : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
 };

 template <typename perf_t>
@@ -121,7 +160,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {

  template <typename T>
  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
+                     bool deterministic,
                     const framework::ExecutionContext& ctx) {
    auto dtype = platform::CudnnDataType<T>::type;
    bool has_got_workspace_size = true;
@@ -183,22 +222,24 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #endif
      VLOG(3) << "choose algo " << algo;
    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      auto workspace_handle = dev_ctx.cudnn_workspace_handle();

+      auto& temp = ctx.cuda_device_context();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(ConvSearchCache::Instance().GetForward());
+
      auto x_dims = framework::vectorize(args.x->dims());
      auto w_dims = framework::vectorize(args.w->dims());

-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
+      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;

      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
            int returned_algo_count;
            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;

@@ -244,7 +285,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {

  template <typename T>
  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
+                     bool deterministic,
                     const framework::ExecutionContext& ctx) {
    auto dtype = platform::CudnnDataType<T>::type;
    bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
@@ -321,22 +362,23 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
    } else if (deterministic) {
      return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      auto workspace_handle = dev_ctx.cudnn_workspace_handle();

+      AlgorithmsCache<algo_t>& algo_cache =
+          *(ConvSearchCache::Instance().GetBackwardData());
+
      auto x_dims = framework::vectorize(args.x->dims());
      auto w_dims = framework::vectorize(args.w->dims());

-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
+      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;

      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
            int returned_algo_count;
            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;

@@ -385,7 +427,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {

  template <typename T>
  static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic, int algo_cache_id,
+                     bool deterministic,
                     const framework::ExecutionContext& ctx) {
    auto dtype = platform::CudnnDataType<T>::type;
    bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF);
@@ -449,22 +491,22 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
    } else if (deterministic) {
      return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else {
-      AlgorithmsCache<algo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<algo_t>>(algo_cache_id);
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(ConvSearchCache::Instance().GetBackwardFilter());

      auto x_dims = framework::vectorize(args.x->dims());
      auto w_dims = framework::vectorize(args.w->dims());

-      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
-               << algo_cache_id << ", x_dims:" << x_dims
-               << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
-               << args.p << ", args.d" << args.d;
+      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;

      algo = algo_cache.GetAlgorithm(
-          x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
            int returned_algo_count;
            std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {

--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -216,9 +216,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    const T* filter_data = transformed_filter_channel.data<T>();

    // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_input,  &transformed_filter_channel,
-                  &transformed_output, strides,
-                  padding_common,      dilations};
+    ConvArgs args{&transformed_input,
+                  &transformed_filter_channel,
+                  &transformed_output,
+                  strides,
+                  padding_common,
+                  dilations,
+                  dtype};

    auto handle = dev_ctx.cudnn_handle();
    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
@@ -269,7 +273,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionFwdAlgo_t algo{};

    using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-    algo = search::Find<T>(args, exhaustive_search, false, 0, ctx);
+    algo = search::Find<T>(args, exhaustive_search, false, ctx);
    workspace_size = search::GetWorkspaceSize(args, algo);

 #if CUDNN_VERSION_MIN(7, 0, 1)
@@ -518,13 +522,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   &transformed_output_grad_channel,
                   strides,
                   padding_common,
-                   dilations};
+                   dilations,
+                   dtype};
    ConvArgs args2{&transformed_input,
                   &transformed_filter_grad_channel,
                   &transformed_output_grad_channel,
                   strides,
                   padding_common,
-                   dilations};
+                   dilations,
+                   dtype};

    auto handle = dev_ctx.cudnn_handle();
    DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
@@ -580,7 +586,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {

      using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
      data_algo =
-          search1::Find<T>(args1, exhaustive_search, deterministic, 0, ctx);
+          search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
      workspace_size =
          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
    }
@@ -597,7 +603,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {

      using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
      filter_algo =
-          search2::Find<T>(args2, exhaustive_search, deterministic, 1, ctx);
+          search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
      workspace_size = std::max(workspace_size,
                                search2::GetWorkspaceSize(args2, filter_algo));
    }
@@ -898,15 +904,26 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {

    auto handle = dev_ctx.cudnn_handle();

-    ConvArgs args1{&transformed_ddX,         W,
-                   &transformed_ddO_channel, strides,
-                   padding_common,           dilations};
-    ConvArgs args2{&transformed_X, ddW,      &transformed_ddO_channel, strides,
-                   padding_common, dilations};
-    ConvArgs args3{&transformed_ddX, dW,       &transformed_dO_channel, strides,
-                   padding_common,   dilations};
-    ConvArgs args4{&transformed_dX, ddW,      &transformed_dO_channel, strides,
-                   padding_common,  dilations};
+    ConvArgs args1{&transformed_ddX,
+                   W,
+                   &transformed_ddO_channel,
+                   strides,
+                   padding_common,
+                   dilations,
+                   dtype};
+    ConvArgs args2{
+        &transformed_X, ddW,  &transformed_ddO_channel, strides, padding_common,
+        dilations,      dtype};
+    ConvArgs args3{&transformed_ddX,
+                   dW,
+                   &transformed_dO_channel,
+                   strides,
+                   padding_common,
+                   dilations,
+                   dtype};
+    ConvArgs args4{
+        &transformed_dX, ddW,  &transformed_dO_channel, strides, padding_common,
+        dilations,       dtype};

    cudnnConvolutionFwdAlgo_t fwd_algo1 =
        static_cast<cudnnConvolutionFwdAlgo_t>(0);
@@ -934,7 +951,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
        args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);

        using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-        fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, 0, ctx);
+        fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
        workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
      }

@@ -949,7 +966,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
        args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);

        using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, 0, ctx);
+        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
        workspace_size = std::max(workspace_size,
                                  search2::GetWorkspaceSize(args2, fwd_algo2));
      }
@@ -967,7 +984,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {

      using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
      filter_algo =
-          search3::Find<T>(args3, exhaustive_search, deterministic, 1, ctx);
+          search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
      workspace_size = std::max(workspace_size,
                                search3::GetWorkspaceSize(args3, filter_algo));
    }
@@ -983,7 +1000,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {

      using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
      data_algo =
-          search4::Find<T>(args4, exhaustive_search, deterministic, 2, ctx);
+          search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
      workspace_size =
          std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
    }

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -178,17 +178,6 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(

  auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                      library, customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-  // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
-  // to false. It should be fixed and then here should only create if library
-  // is kCUDNN.
-  if (configs.empty()) {
-    std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p(
-        new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
-    configs.push_back(p);
-  }
-#endif
  return type;
 }

@@ -563,21 +552,6 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
  auto type = framework::OpKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
      layout_, library_, customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  if (library_ == framework::LibraryType::kCUDNN) {
-    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-    if (configs.empty()) {
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
-          p(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
-      configs.push_back(p);
-
-      std::shared_ptr<
-          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
-          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
-      configs.push_back(p2);
-    }
-  }
-#endif
  return type;
 }

@@ -754,25 +728,6 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
  auto type = framework::OpKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
      layout_, library_, customized_type_value);
-#ifdef PADDLE_WITH_CUDA
-  if (library_ == framework::LibraryType::kCUDNN) {
-    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
-    if (configs.empty()) {
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p0(
-          new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
-      configs.push_back(p0);
-
-      std::shared_ptr<
-          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
-          p1(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
-      configs.push_back(p1);
-
-      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
-          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
-      configs.push_back(p2);
-    }
-  }
-#endif
  return type;
 }


--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -221,8 +221,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {

    auto dtype = platform::CudnnDataType<T>::type;
    // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_output, filter,   &transformed_input, strides,
-                  padding_common,      dilations};
+    ConvArgs args{&transformed_output,
+                  filter,
+                  &transformed_input,
+                  strides,
+                  padding_common,
+                  dilations,
+                  dtype};
    args.handle = handle;
    args.idesc.set(transformed_output, iwo_groups);
    args.wdesc.set(*filter, layout_tensor, iwo_groups);
@@ -230,7 +235,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
    args.cdesc.set(dtype, padding_common, strides, dilations, c_groups);

    using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    algo = search::Find<T>(args, false, deterministic, 2, ctx);
+    algo = search::Find<T>(args, false, deterministic, ctx);
    workspace_size =
        std::max(workspace_size, search::GetWorkspaceSize(args, algo));

@@ -424,12 +429,23 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
    c_groups = groups;
    groups = 1;
 #endif
-    ConvArgs args1{&transformed_output_grad, filter,
-                   &input_transpose,         strides,
-                   padding_common,           dilations};
-    ConvArgs args2{&transformed_output_grad, filter,
-                   &input_transpose,         strides,
-                   padding_common,           dilations};
+
+    auto dtype = platform::CudnnDataType<T>::type;
+
+    ConvArgs args1{&transformed_output_grad,
+                   filter,
+                   &input_transpose,
+                   strides,
+                   padding_common,
+                   dilations,
+                   dtype};
+    ConvArgs args2{&transformed_output_grad,
+                   filter,
+                   &input_transpose,
+                   strides,
+                   padding_common,
+                   dilations,
+                   dtype};
    cudnnConvolutionFwdAlgo_t data_algo{};
    cudnnConvolutionBwdFilterAlgo_t filter_algo{};

@@ -437,7 +453,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
    size_t workspace_size = 0;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
-    auto dtype = platform::CudnnDataType<T>::type;
    bool deterministic = FLAGS_cudnn_deterministic;
    T* input_grad_data = nullptr;
    T* filter_grad_data = nullptr;
@@ -454,7 +469,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
      args1.odesc.set(input_transpose, iwo_groups);
      args1.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
      using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      data_algo = search1::Find<T>(args1, false, deterministic, 0, ctx);
+      data_algo = search1::Find<T>(args1, false, deterministic, ctx);
      workspace_size =
          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
    }
@@ -467,7 +482,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
      args2.odesc.set(input_transpose, iwo_groups);
      args2.cdesc.set(dtype, padding_common, strides, dilations, c_groups);
      using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo = search2::Find<T>(args2, false, deterministic, 1, ctx);
+      filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
      workspace_size = std::max(workspace_size,
                                search2::GetWorkspaceSize(args2, filter_algo));
    }

--- a/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc
@@ -58,8 +58,7 @@ void MainTest(const TestData& test_data) {

  RuntimeContext runtime_ctx =
      RuntimeContext(op->Inputs(), op->Outputs(), scope);
-  ExecutionContext ctx =
-      ExecutionContext(*op, scope, *dev_ctx, runtime_ctx, nullptr);
+  ExecutionContext ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_ctx);
  bool result = ElementwiseMulOp::AreDimsAndFormatCorrect(
      ctx, 16, MKLDNNMemoryFormat::nChw16c);
  if (test_data.supposed_to_fail)

--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -14,10 +14,10 @@ limitations under the License. */

 #include <array>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/operators/math/padding.h"
-#include "paddle/fluid/platform/cudnn_helper.h"

 DECLARE_int64(cudnn_exhaustive_search_times);

@@ -233,7 +233,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
        return fwd_perf_stat[0].algo;
      };
      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>& algo_cache =
-          ctx.GetKernelConfig<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>(0);
+          *(ConvSearchCache::Instance().GetConvFusion());
      int search_times = ctx.Attr<int>("search_times");
      search_times = std::max(
          static_cast<int>(FLAGS_cudnn_exhaustive_search_times), search_times);
@@ -245,8 +245,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
        algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0,
                                       search_func);
      } else {
+        auto dtype = platform::CudnnDataType<T>::type;
        algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings,
-                                       dilations, 0, search_func);
+                                       dilations, 0, dtype, search_func);
      }
      VLOG(3) << "choose algo " << algo;
    }

--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -61,8 +61,8 @@ class WarpCTCOp : public framework::OperatorWithKernel {
    framework::LibraryType library_{framework::LibraryType::kPlain};
    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
-        ctx.device_context(), layout_, library_);
+        OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), ctx.GetPlace(),
+        layout_, library_);
  }
 };

@@ -174,7 +174,7 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
                                       ctx, framework::GradVarName("Loss")),
-                                   ctx.device_context());
+                                   ctx.GetPlace());
  }
 };